author | Ingo Molnar <mingo@kernel.org> | 2018-04-05 09:20:34 +0200
committer | Ingo Molnar <mingo@kernel.org> | 2018-04-05 09:20:34 +0200
commit | ea2a6af517714c52a1209795a03e863e96b460bb (patch)
tree | 3bd443bc9b23ceeaf3743eaf2d6d35ec63c620c9 /kernel
parent | 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 (diff)
parent | 642e7fd23353e22290e3d51719fcb658dc252342 (diff)
Merge branch 'linus' into sched/urgent, to pick up fixes and updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
71 files changed, 3840 insertions(+), 2396 deletions(-)
diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa89..6d21894 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,61 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -#ifdef CONFIG_NUMA -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} -#endif - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/cpu.c b/kernel/cpu.c index 53f7dc6..0db8938 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -124,24 +124,11 @@ struct cpuhp_step { }; static DEFINE_MUTEX(cpuhp_state_mutex); -static struct cpuhp_step cpuhp_bp_states[]; -static struct cpuhp_step cpuhp_ap_states[]; - -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitly in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} +static struct cpuhp_step cpuhp_hp_states[]; static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; + return cpuhp_hp_states + state; } /** @@ -239,6 +226,15 @@ err: } #ifdef CONFIG_SMP +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? 
&st->done_up : &st->done_down; @@ -1224,7 +1220,7 @@ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ -static struct cpuhp_step cpuhp_bp_states[] = { +static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, @@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - /* - * Handled on controll processor until the plugged processor manages - * this itself. - */ - [CPUHP_TEARDOWN_CPU] = { - .name = "cpu:teardown", - .startup.single = NULL, - .teardown.single = takedown_cpu, - .cant_stop = true, - }, -#else - [CPUHP_BRINGUP_CPU] = { }, -#endif -}; - -/* Application processor state steps */ -static struct cpuhp_step cpuhp_ap_states[] = { -#ifdef CONFIG_SMP /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", @@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_ONLINE] = { .name = "ap:online", }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup.single = NULL, + .teardown.single = takedown_cpu, + .cant_stop = true, + }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", @@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) switch (state) { case CPUHP_AP_ONLINE_DYN: - step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; + step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: - step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; + step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: diff --git a/kernel/events/core.c b/kernel/events/core.c index 709a55b..fc1c330 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) + for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } @@ -948,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event, if (!is_cgroup_event(event)) return; - if (add && ctx->nr_cgroups++) - return; - else if (!add && --ctx->nr_cgroups) - return; /* * Because cgroup events are always per-cpu events, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ - if (add) { + + /* + * Since setting cpuctx->cgrp is conditional on the current @cgrp + * matching the event's cgroup, we must do this for every new event, + * because if the first would mismatch, the second would not try again + * and we would leave cpuctx->cgrp unset. 
+ */ + if (add && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; - } else { - list_del(cpuctx_entry); - cpuctx->cgrp = NULL; } + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + + /* no cgroup running */ + if (!add) + cpuctx->cgrp = NULL; + + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + if (add) + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + else + list_del(cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1052,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event, static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - int rotations = 0; + bool rotations; lockdep_assert_irqs_disabled(); @@ -1471,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize event group nodes. + */ +static void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits. + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1481,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) } /* + * Helper function to initializes perf_event_group trees. + */ +static void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +/* + * Compare function for event groups; + * + * Implements complex key that first sorts by CPU and then by virtual index + * which provides ordering when rotating groups for the same CPU. + */ +static bool +perf_event_groups_less(struct perf_event *left, struct perf_event *right) +{ + if (left->cpu < right->cpu) + return true; + if (left->cpu > right->cpu) + return false; + + if (left->group_index < right->group_index) + return true; + if (left->group_index > right->group_index) + return false; + + return false; +} + +/* + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for + * key (see perf_event_groups_less). This places it last inside the CPU + * subtree. + */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + struct perf_event *node_event; + struct rb_node *parent; + struct rb_node **node; + + event->group_index = ++groups->index; + + node = &groups->tree.rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, struct perf_event, group_node); + + if (perf_event_groups_less(event, node_event)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, &groups->tree); +} + +/* + * Helper function to insert event into the pinned or flexible groups. + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. 
+ */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || + RB_EMPTY_ROOT(&groups->tree)); + + rb_erase(&event->group_node, &groups->tree); + init_event_group(event); +} + +/* + * Helper function to delete event from its groups. + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get the leftmost event in the @cpu subtree. + */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *node_event = NULL, *match = NULL; + struct rb_node *node = groups->tree.rb_node; + + while (node) { + node_event = container_of(node, struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + match = node_event; + node = node->rb_left; + } + } + + return match; +} + +/* + * Like rb_entry_next_safe() for the @cpu subtree. + */ +static struct perf_event * +perf_event_groups_next(struct perf_event *event) +{ + struct perf_event *next; + + next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); + if (next && next->cpu == event->cpu) + return next; + + return NULL; +} + +/* + * Iterate through the whole groups tree. + */ +#define perf_event_groups_for_each(event, groups) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), group_node); event; \ + event = rb_entry_safe(rb_next(&event->group_node), \ + typeof(*event), group_node)) + +/* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1500,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1663,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event) group_leader->group_caps &= event->event_caps; - list_add_tail(&event->group_entry, &group_leader->sibling_list); + list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } @@ -1699,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); /* * If event was in error state, then keep it @@ -1717,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; + struct perf_event_context *ctx = event->ctx; - lockdep_assert_held(&event->ctx->lock); + lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. 
@@ -1733,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event) * If this is a sibling, remove it from its group. */ if (event->group_leader != event) { - list_del_init(&event->group_entry); + list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + sibling->group_leader = sibling; + list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!RB_EMPTY_NODE(&event->group_node)) { + add_event_to_groups(sibling, event->ctx); + + if (sibling->state == PERF_EVENT_STATE_ACTIVE) { + struct list_head *list = sibling->attr.pinned ? + &ctx->pinned_active : &ctx->flexible_active; + + list_add_tail(&sibling->active_list, list); + } + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: perf_event__header_size(event->group_leader); - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + for_each_sibling_event(tmp, event->group_leader) perf_event__header_size(tmp); } @@ -1783,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event) */ static inline int pmu_filter_match(struct perf_event *event) { - struct perf_event *child; + struct perf_event *sibling; if (!__pmu_filter_match(event)) return 0; - list_for_each_entry(child, &event->sibling_list, group_entry) { - if (!__pmu_filter_match(child)) + for_each_sibling_event(sibling, event) { + if (!__pmu_filter_match(sibling)) return 0; } @@ -1816,6 +1995,13 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + /* + * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but + * we can schedule events _OUT_ individually through things like + * __perf_remove_from_context(). + */ + list_del_init(&event->active_list); + perf_pmu_disable(event->pmu); event->pmu->del(event, 0); @@ -1856,7 +2042,7 @@ group_sched_out(struct perf_event *group_event, /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) + for_each_sibling_event(event, group_event) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); @@ -2135,7 +2321,7 @@ group_sched_in(struct perf_event *group_event, /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; @@ -2151,7 +2337,7 @@ group_error: * partial group before returning: * The events up to the failed event are scheduled out normally. */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event == partial_group) break; @@ -2328,6 +2514,18 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&task_ctx->lock); } +#ifdef CONFIG_CGROUP_PERF + if (is_cgroup_event(event)) { + /* + * If the current cgroup doesn't match the event's + * cgroup, we should not try to schedule it. 
+ */ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + reprogram = cgroup_is_descendant(cgrp->css.cgroup, + event->cgrp->css.cgroup); + } +#endif + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); @@ -2661,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh) } EXPORT_SYMBOL_GPL(perf_event_refresh); +static int perf_event_modify_breakpoint(struct perf_event *bp, + struct perf_event_attr *attr) +{ + int err; + + _perf_event_disable(bp); + + err = modify_user_hw_breakpoint_check(bp, attr, true); + if (err) { + if (!bp->attr.disabled) + _perf_event_enable(bp); + + return err; + } + + if (!attr->disabled) + _perf_event_enable(bp); + return 0; +} + +static int perf_event_modify_attr(struct perf_event *event, + struct perf_event_attr *attr) +{ + if (event->attr.type != attr->type) + return -EINVAL; + + switch (event->attr.type) { + case PERF_TYPE_BREAKPOINT: + return perf_event_modify_breakpoint(event, attr); + default: + /* Place holder for future additions. */ + return -EOPNOTSUPP; + } +} + static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { + struct perf_event *event, *tmp; int is_active = ctx->is_active; - struct perf_event *event; lockdep_assert_held(&ctx->lock); @@ -2713,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -3005,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int visit_groups_merge(struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), void *data) { - struct perf_event *event; + struct perf_event **evt, *evt1, *evt2; + int ret; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + evt1 = perf_event_groups_first(groups, -1); + evt2 = perf_event_groups_first(groups, cpu); + + while (evt1 || evt2) { + if (evt1 && evt2) { + if (evt1->group_index < evt2->group_index) + evt = &evt1; + else + evt = &evt2; + } else if (evt1) { + evt = &evt1; + } else { + evt = &evt2; + } - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); + ret = func(*evt, data); + if (ret) + return ret; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. 
- */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + *evt = perf_event_groups_next(*evt); } + + return 0; +} + +struct sched_in_data { + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + int can_add_hw; +}; + +static int pinned_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->pinned_active); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + return 0; +} + +static int flexible_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->flexible_active); + else + sid->can_add_hw = 0; + } + + return 0; +} + +static void +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; + + visit_groups_merge(&ctx->pinned_groups, + smp_processor_id(), + pinned_sched_in, &sid); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } + visit_groups_merge(&ctx->flexible_groups, + smp_processor_id(), + flexible_sched_in, &sid); } static void @@ -3132,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); @@ -3361,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } /* - * Round-robin a context's events: + * Move @event to the tail of the @ctx's elegible events. */ -static void rotate_ctx(struct perf_event_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (ctx->rotate_disable) + return; + + perf_event_groups_delete(&ctx->flexible_groups, event); + perf_event_groups_insert(&ctx->flexible_groups, event); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx) +static inline struct perf_event * +ctx_first_active(struct perf_event_context *ctx) { + return list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); +} + +static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +{ + struct perf_event *cpu_event = NULL, *task_event = NULL; + bool cpu_rotate = false, task_rotate = false; struct perf_event_context *ctx = NULL; - int rotate = 0; + + /* + * Since we run this from IRQ context, nobody can install new + * events, thus the event count values are stable. + */ if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + cpu_rotate = true; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { if (ctx->nr_events != ctx->nr_active) - rotate = 1; + task_rotate = true; } - if (!rotate) - goto done; + if (!(cpu_rotate || task_rotate)) + return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) + if (task_rotate) + task_event = ctx_first_active(ctx); + if (cpu_rotate) + cpu_event = ctx_first_active(&cpuctx->ctx); + + /* + * As per the order given at ctx_resched() first 'pop' task flexible + * and then, if needed CPU flexible. + */ + if (task_event || (ctx && cpu_event)) ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (cpu_event) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + if (task_event) + rotate_ctx(ctx, task_event); + if (cpu_event) + rotate_ctx(&cpuctx->ctx, cpu_event); perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - return rotate; + return true; } void perf_event_task_tick(void) @@ -3554,7 +3876,7 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, group_entry) { + for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3728,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); + INIT_LIST_HEAD(&ctx->pinned_active); + INIT_LIST_HEAD(&ctx->flexible_active); atomic_set(&ctx->refcount, 1); } @@ -4400,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -4594,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, group_entry) + for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } @@ -4676,6 +5000,8 @@ static int 
perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -4748,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); + + case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { + struct perf_event_attr new_attr; + int err = perf_copy_attr((struct perf_event_attr __user *)arg, + &new_attr); + + if (err) + return err; + + return perf_event_modify_attr(event, &new_attr); + } default: return -ENOTTY; } @@ -5743,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); @@ -5752,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && @@ -8009,9 +8347,119 @@ static struct pmu perf_tracepoint = { .read = perf_swevent_read, }; +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match following PMU_FORMAT_ATTR(). + * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ +}; + +PMU_FORMAT_ATTR(retprobe, "config:0"); + +static struct attribute *probe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group probe_format_group = { + .name = "format", + .attrs = probe_attrs, +}; + +static const struct attribute_group *probe_attr_groups[] = { + &probe_format_group, + NULL, +}; +#endif + +#ifdef CONFIG_KPROBE_EVENTS +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_kprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_kprobe_destroy; + + return 0; +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + 
.attr_groups = probe_attr_groups, +}; + +static int perf_uprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_uprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_uprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_uprobe_destroy; + + return 0; +} +#endif /* CONFIG_UPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8088,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event) } #endif +/* + * returns true if the event is a tracepoint, or a kprobe/upprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) +{ + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; +#endif + return false; +} + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; @@ -8140,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static void perf_event_free_bpf_prog(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) { + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } @@ -8336,7 +8803,8 @@ restart: * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * - * if <size> is not specified, the range is treated as a single address. + * if <size> is not specified or is zero, the range is treated as a single + * address; not valid for ACTION=="filter". 
*/ enum { IF_ACT_NONE = -1, @@ -8386,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { + static const enum perf_addr_filter_action_t actions[] = { + [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, + [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, + [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, + }; ret = -EINVAL; if (!*start) @@ -8402,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, switch (token) { case IF_ACT_FILTER: case IF_ACT_START: - filter->filter = 1; - case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; + filter->action = actions[token]; state = IF_STATE_SOURCE; break; @@ -8420,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_SOURCE) goto fail; - if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) - filter->range = 1; - *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; - if (filter->range) { + if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) @@ -8436,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { - int fpos = filter->range ? 2 : 1; + int fpos = token == IF_SRC_FILE ? 2 : 1; filename = match_strdup(&args[fpos]); if (!filename) { @@ -8462,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (kernel && event->attr.exclude_kernel) goto fail; + /* + * ACTION "filter" must have a non-zero length region + * specified. + */ + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && + !filter->size) + goto fail; + if (!kernel) { if (!filename) goto fail; @@ -8559,47 +9036,36 @@ fail_clear_files: return ret; } -static int -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) -{ - struct perf_event_context *ctx = event->ctx; - int ret; - - /* - * Beware, here be dragons!! - * - * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint - * stuff does not actually need it. So temporarily drop ctx->mutex. As per - * perf_event_ctx_lock() we already have a reference on ctx. - * - * This can result in event getting moved to a different ctx, but that - * does not affect the tracepoint state. - */ - mutex_unlock(&ctx->mutex); - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - mutex_lock(&ctx->mutex); - - return ret; -} - static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == PERF_TYPE_TRACEPOINT) - ret = perf_tracepoint_set_filter(event, filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. 
+ * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); @@ -9452,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->active_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -9729,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (!attr->sample_max_stack) + attr->sample_max_stack = sysctl_perf_event_max_stack; + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); out: @@ -9942,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; - if (!attr.sample_max_stack) - attr.sample_max_stack = sysctl_perf_event_max_stack; - /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -10218,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(group_leader, 0); put_ctx(gctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -10240,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open, * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); @@ -10880,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. 
*/ - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) @@ -10979,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -10995,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e..6e28d28 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -44,6 +44,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/bug.h> #include <linux/hw_breakpoint.h> /* @@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) return 1; } -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +static inline enum bp_type_idx find_slot_idx(u64 bp_type) { - if (bp->attr.bp_type & HW_BREAKPOINT_RW) + if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; @@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.target == tsk && - find_slot_idx(iter) == type && + find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } @@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ -static int __reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; @@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp) return -ENOMEM; /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) + if (bp_type == HW_BREAKPOINT_EMPTY || + bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); @@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - ret = __reserve_bp_slot(bp); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); return ret; } -static void __release_bp_slot(struct perf_event *bp) +static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } @@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); + __release_bp_slot(bp, 
bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); } +static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int err; + + __release_bp_slot(bp, old_type); + + err = __reserve_bp_slot(bp, bp->attr.bp_type); + if (err) { + /* + * Reserve the old_type slot back in case + * there's no space for the new type. + * + * This must succeed, because we just released + * the old_type slot in the __release_bp_slot + * call above. If not, something is broken. + */ + WARN_ON(__reserve_bp_slot(bp, old_type)); + } + + return err; +} + +static int modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + ret = __modify_bp_slot(bp, old_type); + mutex_unlock(&nr_bp_mutex); + return ret; +} + /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and @@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - return __reserve_bp_slot(bp); + return __reserve_bp_slot(bp, bp->attr.bp_type); } int dbg_release_bp_slot(struct perf_event *bp) @@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); return 0; } @@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + bool modify = attr->bp_type != old_type; + int err = 0; + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (check && memcmp(&bp->attr, attr, sizeof(*attr))) + return -EINVAL; + + err = validate_hw_breakpoint(bp); + if (!err && modify) + err = modify_bp_slot(bp, old_type); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + return err; + } + + bp->attr.disabled = attr->disabled; + return 0; +} + /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. 
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; + if (!attr->disabled) { + int err = modify_user_hw_breakpoint_check(bp, attr, false); - err = validate_hw_breakpoint(bp); - if (!err) + if (err) + return err; perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; + bp->attr.disabled = 0; } - -end: - bp->attr.disabled = attr->disabled; - return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/exit.c b/kernel/exit.c index 995453d..c3c7ac5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { - return sys_wait4(pid, stat_addr, options, NULL); + return kernel_wait4(pid, stat_addr, options, NULL); } #endif diff --git a/kernel/fork.c b/kernel/fork.c index e5d9d40..f71b67d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, - 1, NULL, NULL, 0); + do_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } @@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp * constructed. Here we are modifying the current, active, * task_struct. */ -SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; @@ -2470,6 +2470,11 @@ bad_unshare_out: return err; } +SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +{ + return ksys_unshare(unshare_flags); +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to diff --git a/kernel/kexec.c b/kernel/kexec.c index e62ec4d..aed8fb2 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -192,11 +192,9 @@ out: * that to happen you need to do that yourself. */ -SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, - struct kexec_segment __user *, segments, unsigned long, flags) +static inline int kexec_load_check(unsigned long nr_segments, + unsigned long flags) { - int result; - /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; @@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) return -EINVAL; - /* Verify we are on the appropriate architecture */ - if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && - ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - return -EINVAL; - /* Put an artificial cap on the number * of segments passed to kexec_load. 
*/ if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; + return 0; +} + +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) +{ + int result; + + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, struct kexec_segment out, __user *ksegments; unsigned long i, result; + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + /* Don't allow clients that don't understand the native * architecture to do anything. */ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); @@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - return sys_kexec_load(entry, nr_segments, ksegments, flags); + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + + mutex_unlock(&kexec_mutex); + + return result; } #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83..0233863 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) return; } + printk(KERN_CONT "%p", hlock->instance); print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%p>] %pS\n", - (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); + printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - printk("\nnew class %p: %s", class->key, class->name); + printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) } printk("%*s }\n", depth, ""); - printk("%*s ... key at: [<%p>] %pS\n", + printk("%*s ... 
key at: [<%px>] %pS\n", depth, "", class->key, class->key); } @@ -2340,7 +2340,7 @@ cache_hit: if (very_verbose(class)) { printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", + "%016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2349,7 +2349,7 @@ cache_hit: } if (very_verbose(class)) { - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + printk("hardirqs last enabled at (%u): [<%px>] %pS\n", curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, (void *)curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + printk("hardirqs last disabled at (%u): [<%px>] %pS\n", curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, (void *)curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): [<%p>] %pS\n", + printk("softirqs last enabled at (%u): [<%px>] %pS\n", curr->softirq_enable_event, (void *)curr->softirq_enable_ip, (void *)curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): [<%p>] %pS\n", + printk("softirqs last disabled at (%u): [<%px>] %pS\n", curr->softirq_disable_event, (void *)curr->softirq_disable_ip, (void *)curr->softirq_disable_ip); } @@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, * Sanity check, the lock-class key must be persistent: */ if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); + printk("BUG: key %px not in .data!\n", key); /* * What it says above ^^^^^, I suggest you read it. 
*/ @@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); + printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 940633c..4f014be 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - if (rt_mutex_has_waiters(lock)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3..d1d62f9 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { - struct rt_mutex_waiter *w; - - w = rb_entry(lock->waiters.rb_leftmost, - struct rt_mutex_waiter, tree_entry); - BUG_ON(w->lock != lock); + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + struct rt_mutex_waiter *w = NULL; + if (leftmost) { + w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + BUG_ON(w->lock != lock); + } return w; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c55..30465a2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } @@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); __up_write(sem); @@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_set_reader_owned(sem); __downgrade_write(sem); @@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f..a17cba8 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,6 +16,12 @@ */ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +#else +# define DEBUG_RWSEMS_WARN_ON(c) +#endif + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * do a 
write to the rwsem cacheline when it is really necessary * to minimize cacheline contention. */ - if (sem->owner != RWSEM_READER_OWNED) + if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } diff --git a/kernel/module.c b/kernel/module.c index e42764a..a6e43a5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif } void *__symbol_get(const char *symbol) diff --git a/kernel/panic.c b/kernel/panic.c index 4b794f1..9d833d9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -289,7 +289,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif - pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); + pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef..93b57f0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. - * sys_wait4() will also block until our children traced from the + * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); + rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * kernel_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9..4710f1b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -701,7 +701,7 @@ int hibernate(void) } pr_info("Syncing filesystems ... \n"); - sys_sync(); + ksys_sync(); pr_info("done.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0685c44..4c10be0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); pr_info("Syncing filesystems ... "); - sys_sync(); + ksys_sync(); pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/user.c b/kernel/power/user.c index 22df9f7..75c959d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; printk("Syncing filesystems ... "); - sys_sync(); + ksys_sync(); printk("done.\n"); error = freeze_processes(); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c..7a693e3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) WARN_ON_ONCE(rcu_seq_state(*sp) != 1); } +/* Compute the end-of-grace-period value for the specified sequence number. 
*/ +static inline unsigned long rcu_seq_endval(unsigned long *sp) +{ + return (*sp | RCU_SEQ_STATE_MASK) + 1; +} + /* Adjust sequence number for end of update-side operation. */ static inline void rcu_seq_end(unsigned long *sp) { smp_mb(); /* Ensure update-side operation before counter increment. */ WARN_ON_ONCE(!rcu_seq_state(*sp)); - WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); + WRITE_ONCE(*sp, rcu_seq_endval(sp)); } /* Take a snapshot of the update side's sequence number. */ @@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) + for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + (cpu) <= rnp->grphi; \ + (cpu) = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Iterate over all CPUs in a leaf RCU node's specified mask. + */ +#define rcu_find_next_bit(rnp, cpu, mask) \ + ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) +#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ + for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + (cpu) <= rnp->grphi; \ + (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) /* * Wrappers for the rcu_node::lock acquire and release. @@ -337,7 +353,7 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -348,6 +364,9 @@ do { \ ___locked; \ }) +#define raw_lockdep_assert_held_rcu_node(p) \ + lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU @@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 -#ifdef CONFIG_TINY_RCU -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } -#else /* #ifdef CONFIG_TINY_RCU */ -void rcu_request_urgent_qs_task(struct task_struct *t); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, @@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9..777e7a6 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
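rcu_seq_endval() above works because the grace-period sequence counter keeps the phase of the current grace period in its low-order bits (RCU_SEQ_STATE_MASK) and the grace-period count in the remaining bits: OR-ing in the mask and adding one clears the phase and advances the count in one step, which is the value rcu_seq_end() now stores. The stand-alone sketch below demonstrates that arithmetic; it assumes the same two low-order state bits as the kernel's definitions, and the seq_* names are invented.

#include <stdio.h>

/* Low bits hold the grace-period phase, high bits the count (assumed layout). */
#define SEQ_CTR_SHIFT	2
#define SEQ_STATE_MASK	((1UL << SEQ_CTR_SHIFT) - 1)

static unsigned long seq_ctr(unsigned long s)   { return s >> SEQ_CTR_SHIFT; }
static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }

/* Value the counter will hold once the grace period now in progress ends. */
static unsigned long seq_endval(unsigned long s)
{
	return (s | SEQ_STATE_MASK) + 1;
}

int main(void)
{
	unsigned long sp = 0;

	sp = sp + 1;		/* start of a grace period: phase 0 -> 1 */
	printf("ctr=%lu state=%lu end-of-GP value has ctr=%lu\n",
	       seq_ctr(sp), seq_state(sp), seq_ctr(seq_endval(sp)));

	sp = seq_endval(sp);	/* end of grace period: phase cleared, ctr + 1 */
	printf("ctr=%lu state=%lu\n", seq_ctr(sp), seq_state(sp));
	return 0;
}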
%s\n", perf_type, s); } while (0) +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + */ + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, 0, "Number of RCU reader threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fd..680c96d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) { + if (!can_expedite) pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Disabled dynamic grace-period expediting.\n", - torture_type); - } /* Initialize synctype[] array. If none set, take default. 
*/ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; - else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) - pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); - if (gp_exp1 && cur_ops->exp_sync) + pr_info("%s: Testing conditional GPs.\n", __func__); + } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + pr_alert("%s: gp_cond without primitives.\n", __func__); + } + if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; - else if (gp_exp && !cur_ops->exp_sync) - pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); - if (gp_normal1 && cur_ops->deferred_free) + pr_info("%s: Testing expedited GPs.\n", __func__); + } else if (gp_exp && !cur_ops->exp_sync) { + pr_alert("%s: gp_exp without primitives.\n", __func__); + } + if (gp_normal1 && cur_ops->deferred_free) { synctype[nsynctypes++] = RTWS_DEF_FREE; - else if (gp_normal && !cur_ops->deferred_free) - pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); - if (gp_sync1 && cur_ops->sync) + pr_info("%s: Testing asynchronous GPs.\n", __func__); + } else if (gp_normal && !cur_ops->deferred_free) { + pr_alert("%s: gp_normal without primitives.\n", __func__); + } + if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; - else if (gp_sync && !cur_ops->sync) - pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + pr_info("%s: Testing normal GPs.\n", __func__); + } else if (gp_sync && !cur_ops->sync) { + pr_alert("%s: gp_sync without primitives.\n", __func__); + } if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg) rcu_unexpedite_gp(); if (++expediting > 3) expediting = -expediting; + } else if (!can_expedite) { /* Disabled during boot, recheck. 
*/ + can_expedite = !rcu_gp_is_expedited() && + !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); @@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) while (can_expedite && expediting++ < 0) rcu_unexpedite_gp(); WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " Dynamic grace-period expediting was disabled.\n", + torture_type); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (torture_random(&rand) & 0x80) + if (cur_ops->sync && torture_random(&rand) & 0x80) cur_ops->sync(); - else + else if (cur_ops->exp_sync) cur_ops->exp_sync(); - } else if (gp_normal) { + } else if (gp_normal && cur_ops->sync) { cur_ops->sync(); - } else { + } else if (cur_ops->exp_sync) { cur_ops->exp_sync(); } stutter_wait("rcu_torture_fakewriter"); @@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) atomic_set(&barrier_cbs_count, 0); atomic_set(&barrier_cbs_invoked, 0); barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), GFP_KERNEL); barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { @@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) * next grace period. Unlikely, but can happen. If it * does happen, the debug-objects subsystem won't have splatted. */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); + pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ @@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) /* Wait for them all to get done so we can safely return. 
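The allocation changes above replace kzalloc(n * size, GFP_KERNEL) with kcalloc(n, size, GFP_KERNEL); both return zeroed memory, but kcalloc also rejects a count/size product that would overflow instead of quietly allocating a short buffer. A stand-alone sketch of that overflow check follows; xcalloc() is an invented wrapper around plain calloc(), not a kernel interface.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Roughly the extra guarantee a two-argument allocator gives over n * size. */
static void *xcalloc(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;		/* n * size would overflow */
	return calloc(n, size);		/* zeroed allocation */
}

int main(void)
{
	void *ok  = xcalloc(16, sizeof(long));
	void *bad = xcalloc(SIZE_MAX / 2, 4);	/* product overflows: rejected */

	printf("ok=%p bad=%p\n", ok, bad);
	free(ok);
	return 0;
}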
*/ rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } @@ -1799,7 +1809,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; if (nfakewriters > 0) { - fakewriter_tasks = kzalloc(nfakewriters * + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1814,7 +1824,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("out of memory"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81..fb560fc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - lockdep_assert_held(&sp->lock); + lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, - &sdp->work, delay); + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); } /* @@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + bool last_lvl; int cpu; unsigned long flags; unsigned long gpseq; int idx; - int idxnext; unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; @@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - if (snp >= sp->level[rcu_num_lvls - 1]) + last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); @@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. 
*/ - if (!(gpseq & counter_wrap_check)) + if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); spin_unlock_irq_rcu_node(sp); - /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff - ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, 0); } else { spin_unlock_irq_rcu_node(sp); } @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_rcu_node(sp, flags); - if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, - srcu_get_delay(sp)); + queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf3..2a73469 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_kthread ? rsp->gp_kthread->state : ~0, rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { + pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); } @@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * If RCU is idle, we just wait for the next grace period. @@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * Pick up grace-period number for new callbacks. If this @@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. 
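The SRCU hunks above compare grace-period sequence numbers with ULONG_CMP_LT()/ULONG_CMP_GE() rather than with plain relational operators, so the tests stay correct when the counters wrap; the counter-wrap branch in srcu_gp_end() exists to nudge lagging per-CPU snapshots forward for the same reason. The stand-alone sketch below illustrates the wraparound-safe idiom; the CMP_* macro bodies follow my reading of the kernel helpers and the names are shortened for the example.

#include <limits.h>
#include <stdio.h>

/* Unsigned subtraction keeps ordering meaningful across a counter wrap. */
#define CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long snap = ULONG_MAX - 2;	/* snapshot taken just before a wrap */
	unsigned long cur  = snap + 5;		/* counter has since wrapped to 2 */

	/* A plain "cur >= snap" is false after the wrap; the safe form is not. */
	printf("plain: %d  wrap-safe: %d\n", cur >= snap, CMP_GE(cur, snap));
	printf("snapshot precedes current: %d\n", CMP_LT(snap, cur));
	return 0;
}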
*/ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -2296,7 +2297,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - lockdep_assert_held(&rcu_get_root(rsp)->lock); + raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Walk up the rcu_node hierarchy. */ for (;;) { @@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->n_cbs_invoked += count; rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ @@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; + if (ret) return; - } rnp_old = rnp; } /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ @@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. 
*/ } @@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - rdp->n_rcu_pending++; - /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); @@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { - rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { - rdp->n_rp_report_qs++; + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) return 1; - } /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rdp->n_rp_cb_ready++; + if (rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; - } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; + if (cpu_needs_another_gp(rsp, rdp)) return 1; - } /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; + if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ return 1; - } /* Has a new RCU grace period started? */ if (READ_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ - rdp->n_rp_gp_started++; + unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - } /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) { - rdp->n_rp_nocb_defer_wakeup++; + if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - } /* nothing to do */ - rdp->n_rp_need_nothing++; return 0; } @@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); @@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_wq; + void __init rcu_init(void) { int cpu; @@ -4219,6 +4195,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b..f491ab4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -146,12 +146,6 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. 
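rcu_init() above creates a dedicated rcu_gp_wq workqueue with WQ_MEM_RECLAIM, and later hunks queue expedited-grace-period and Tree SRCU work to it instead of to the shared system workqueues, presumably so that grace-period processing keeps its own rescuer thread and can make progress under memory pressure. The hypothetical minimal module below shows only the general allocation pattern; the demo_* names are invented and this is not how RCU itself is initialized.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work ran on the dedicated workqueue\n");
}
static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* WQ_MEM_RECLAIM attaches a rescuer so queued work cannot be starved. */
	demo_wq = alloc_workqueue("demo_wq", WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		return -ENOMEM;
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);	/* drains pending work before freeing */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");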
*/ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -184,13 +178,6 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -217,8 +204,6 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -234,18 +219,7 @@ struct rcu_data { /* Grace period that needs help */ /* from cond_resched(). */ - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_core_needs_qs; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_nocb_defer_wakeup; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ + /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; @@ -256,7 +230,7 @@ struct rcu_data { atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 7) Callback offloading. */ + /* 6) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -283,7 +257,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 8) RCU CPU stall data. */ + /* 7) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -374,10 +348,6 @@ struct rcu_state { /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b5..f72eefa 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) } /* + * Return then value that expedited-grace-period counter will have + * at the end of the current grace period. 
+ */ +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +{ + return rcu_seq_endval(&rsp->expedited_sequence); +} + +/* * Record the end of an expedited grace period. */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) @@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, int ret; struct rcu_node *rnp; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; - rdp->exp_dynticks_snap = - rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || - !(rnp->qsmaskinitnext & rdp->grpmask)) - mask_ofl_test |= rdp->grpmask; + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) + mask_ofl_test |= mask; + else + rdp->exp_dynticks_snap = snap; + } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -417,6 +435,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; @@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a02..84fbee4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? 
RCU_EXP_BLKD : 0); struct task_struct *t = current; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); sched_show_task(t); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period. */ - if (rnp->exp_tasks != NULL) { + if (rnp->exp_tasks != NULL) tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { + else tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to @@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) char *ticks_title; unsigned long ticks_value; + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + if (rsp->gpnum == rdp->gpnum) { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; @@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) smp_mb__before_atomic(); /* _add after CB invocation. 
*/ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - rdp->n_nocbs_invoked += c; } return 0; } @@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4f..d9a02b3 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o idle.o +obj-y += idle.o fair.o rt.o deadline.o +obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index bb4b9fe..6be6c57 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,10 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/utsname.h> -#include <linux/security.h> -#include <linux/export.h> - +/* + * Auto-group scheduling implementation: + */ #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; @@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) autogroup_kref_put(prev); } -/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +/* Allocates GFP_KERNEL, cannot be called under any spinlock: */ void sched_autogroup_create_attach(struct task_struct *p) { struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra reference added by autogroup_create() */ + + /* Drop extra reference added by autogroup_create(): */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); -/* Cannot be called under siglock. Currently has no users */ +/* Cannot be called under siglock. Currently has no users: */ void sched_autogroup_detach(struct task_struct *p) { autogroup_move_group(p, &autogroup_default); @@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) return 1; } - __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS @@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) if (nice < 0 && !can_nice(current, nice)) return -EPERM; - /* this is a heavy operation taking global locks.. */ + /* This is a heavy operation, taking global locks.. 
*/ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) return -EAGAIN; @@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif /* CONFIG_SCHED_DEBUG */ +#endif diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b..b964199 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,15 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP -#include <linux/kref.h> -#include <linux/rwsem.h> -#include <linux/sched/autogroup.h> - struct autogroup { /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. */ struct kref kref; struct task_group *tg; @@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) return tg; } -#ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { return 0; } -#endif #endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086bab..10c83e7 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,5 +1,5 @@ /* - * sched_clock for unstable cpu clocks + * sched_clock() for unstable CPU clocks * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * @@ -11,7 +11,7 @@ * Guillaume Chazarain <guichaz@gmail.com> * * - * What: + * What this file implements: * * cpu_clock(i) provides a fast (execution time) high resolution * clock with bounded drift between CPUs. The value of cpu_clock(i) @@ -26,11 +26,11 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * local_clock() -- is cpu_clock() on the current cpu. + * local_clock() -- is cpu_clock() on the current CPU. * * sched_clock_cpu(i) * - * How: + * How it is implemented: * * The implementation either uses sched_clock() when * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the @@ -52,19 +52,7 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include <linux/spinlock.h> -#include <linux/hardirq.h> -#include <linux/export.h> -#include <linux/percpu.h> -#include <linux/ktime.h> -#include <linux/sched.h> -#include <linux/nmi.h> -#include <linux/sched/clock.h> -#include <linux/static_key.h> -#include <linux/workqueue.h> -#include <linux/compiler.h> -#include <linux/tick.h> -#include <linux/init.h> +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. @@ -302,21 +290,21 @@ again: * cmpxchg64 below only protects one readout. * * We must reread via sched_clock_local() in the retry case on - * 32bit as an NMI could use sched_clock_local() via the + * 32-bit kernels as an NMI could use sched_clock_local() via the * tracer and hit between the readout of - * the low32bit and the high 32bit portion. + * the low 32-bit and the high 32-bit portion. */ this_clock = sched_clock_local(my_scd); /* - * We must enforce atomic readout on 32bit, otherwise the - * update on the remote cpu can hit inbetween the readout of - * the low32bit and the high 32bit portion. + * We must enforce atomic readout on 32-bit, otherwise the + * update on the remote CPU can hit inbetween the readout of + * the low 32-bit and the high 32-bit portion. 
*/ remote_clock = cmpxchg64(&scd->clock, 0, 0); #else /* - * On 64bit the read of [my]scd->clock is atomic versus the - * update, so we can avoid the above 32bit dance. + * On 64-bit kernels the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32-bit dance. */ sched_clock_local(my_scd); again: diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef..e426b0c 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,10 +11,7 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ - -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/completion.h> +#include "sched.h" /** * complete: - signals a single thread waiting on this completion @@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); bool try_wait_for_completion(struct completion *x) { unsigned long flags; - int ret = 1; + bool ret = true; /* * Since x->done will need to be locked only @@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) * return early in the blocking case. */ if (!READ_ONCE(x->done)) - return 0; + return false; spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) - ret = 0; + ret = false; else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94895b..28b6899 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,11 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <uapi/linux/sched/types.h> -#include <linux/sched/loadavg.h> -#include <linux/sched/hotplug.h> -#include <linux/wait_bit.h> -#include <linux/cpuset.h> -#include <linux/delayacct.h> -#include <linux/init_task.h> -#include <linux/context_tracking.h> -#include <linux/rcupdate_wait.h> -#include <linux/compat.h> - -#include <linux/blkdev.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/prefetch.h> -#include <linux/profile.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/sched/isolation.h> +#include "sched.h" #include <asm/switch_to.h> #include <asm/tlb.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old cpu in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. 
* * If we observe the new CPU in task_rq_lock, the acquire will @@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) } #endif /* CONFIG_SMP */ -static void init_rq_hrtick(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP rq->hrtick_csd_pending = 0; @@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) { } -static inline void init_rq_hrtick(struct rq *rq) +static inline void hrtick_rq_init(struct rq *rq) { } #endif /* CONFIG_SCHED_HRTICK */ @@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) return false; if (idle_cpu(cpu) && !need_resched()) @@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) * We can't run Idle Load Balance on this CPU for this time so we * cancel it and clear NOHZ_BALANCE_KICK */ - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); return false; } @@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); * * - cpu_active must be a subset of cpu_online * - * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, * see __set_cpus_allowed_ptr(). At this point the newly online * CPU isn't yet part of the sched domains, and balancing will not * see it. @@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) { - static_key_slow_inc(&preempt_notifier_key); + static_branch_inc(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) { - static_key_slow_dec(&preempt_notifier_key); + static_branch_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_dec); @@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); */ void preempt_notifier_register(struct preempt_notifier *notifier) { - if (!static_key_false(&preempt_notifier_key)) + if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while notifiers disabled\n"); hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); @@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); } @@ -2555,7 +2529,7 @@ static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next); } @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * 
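The preempt-notifier hunks above replace a hand-rolled struct static_key with the jump-label API: DEFINE_STATIC_KEY_FALSE() plus static_branch_unlikely() keeps the hot-path check patched out as a no-op until static_branch_inc() enables it. The kernel-style sketch below shows that pattern in isolation; the demo_* names are invented and the snippet is an illustration, not part of this patch.

#include <linux/jump_label.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_FALSE(demo_feature);

void demo_feature_enable(void)
{
	static_branch_inc(&demo_feature);	/* reference-counted enable */
}

void demo_feature_disable(void)
{
	static_branch_dec(&demo_feature);
}

void demo_hot_path(void)
{
	/* Compiles to a patched no-op while the key is disabled. */
	if (static_branch_unlikely(&demo_feature))
		pr_info("slow path: feature is enabled\n");
}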
64-bit doesn't need locks to atomically read a 64bit value. + * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * @@ -3096,35 +3082,99 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif - rq_last_tick_reset(rq); } #ifdef CONFIG_NO_HZ_FULL -/** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. - */ -u64 scheduler_tick_max_deferment(void) + +struct tick_work { + int cpu; + struct delayed_work work; +}; + +static struct tick_work __percpu *tick_work_cpu; + +static void sched_tick_remote(struct work_struct *work) { - struct rq *rq = this_rq(); - unsigned long next, now = READ_ONCE(jiffies); + struct delayed_work *dwork = to_delayed_work(work); + struct tick_work *twork = container_of(dwork, struct tick_work, work); + int cpu = twork->cpu; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + /* + * Handle the tick only if it appears the remote CPU is running in full + * dynticks mode. The check is racy by nature, but missing a tick or + * having one too much is no big deal because the scheduler tick updates + * statistics and checks timeslices in a time-independent way, regardless + * of when exactly it is running. + */ + if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { + struct task_struct *curr; + u64 delta; - next = rq->last_sched_tick + HZ; + rq_lock_irq(rq, &rf); + update_rq_clock(rq); + curr = rq->curr; + delta = rq_clock_task(rq) - curr->se.exec_start; - if (time_before_eq(next, now)) - return 0; + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + rq_unlock_irq(rq, &rf); + } - return jiffies_to_nsecs(next - now); + /* + * Run the remote tick once per second (1Hz). This arbitrary + * frequency is large enough to avoid overload but short enough + * to keep scheduler internal stats reasonably up to date. + */ + queue_delayed_work(system_unbound_wq, dwork, HZ); +} + +static void sched_tick_start(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); } + +#ifdef CONFIG_HOTPLUG_CPU +static void sched_tick_stop(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + cancel_delayed_work_sync(&twork->work); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __init sched_tick_offload_init(void) +{ + tick_work_cpu = alloc_percpu(struct tick_work); + BUG_ON(!tick_work_cpu); + + return 0; +} + +#else /* !CONFIG_NO_HZ_FULL */ +static inline void sched_tick_start(int cpu) { } +static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4892,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * Return: 0. 
*/ -SYSCALL_DEFINE0(sched_yield) +static void do_sched_yield(void) { struct rq_flags rf; struct rq *rq; @@ -4913,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield) sched_preempt_enable_no_resched(); schedule(); +} +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); return 0; } @@ -4997,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); void __sched yield(void) { set_current_state(TASK_RUNNING); - sys_sched_yield(); + do_sched_yield(); } EXPORT_SYMBOL(yield); @@ -5786,6 +5840,7 @@ int sched_cpu_starting(unsigned int cpu) { set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); + sched_tick_start(cpu); return 0; } @@ -5797,6 +5852,7 @@ int sched_cpu_dying(unsigned int cpu) /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); + sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); if (rq->rd) { @@ -5809,7 +5865,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(cpu); + nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } @@ -6022,13 +6078,11 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; - rq->nohz_flags = 0; -#endif -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = 0; + rq->last_blocked_load_update_tick = jiffies; + atomic_set(&rq->nohz_flags, 0); #endif #endif /* CONFIG_SMP */ - init_rq_hrtick(rq); + hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7027,3 +7081,5 @@ const u32 sched_prio_to_wmult[40] = { /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; + +#undef CREATE_TRACE_POINTS diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a..9fbb103 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,24 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/cgroup.h> -#include <linux/slab.h> -#include <linux/percpu.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/seq_file.h> -#include <linux/rcupdate.h> -#include <linux/kernel_stat.h> -#include <linux/err.h> - -#include "sched.h" - /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include "sched.h" -/* Time spent by the tasks of the cpu accounting group executing in ... */ +/* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... user mode */ CPUACCT_STAT_SYSTEM, /* ... kernel mode */ @@ -35,12 +24,12 @@ struct cpuacct_usage { u64 usages[CPUACCT_STAT_NSTATS]; }; -/* track cpu usage of a group of tasks and its child groups */ +/* track CPU usage of a group of tasks and its child groups */ struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - struct cpuacct_usage __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every CPU */ + struct cpuacct_usage __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; }; static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) @@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group to which this task belongs */ +/* Return CPU accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { return css_ca(task_css(tsk, cpuacct_cgrp_id)); @@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { .cpuusage = &root_cpuacct_cpuusage, }; -/* create a new cpu accounting group */ +/* Create a new CPU accounting group */ static struct cgroup_subsys_state * cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -96,7 +85,7 @@ out: return ERR_PTR(-ENOMEM); } -/* destroy an existing cpu accounting group */ +/* Destroy an existing CPU accounting group */ static void cpuacct_css_free(struct cgroup_subsys_state *css) { struct cpuacct *ca = css_ca(css); @@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) #endif } -/* return total cpu usage (in nanoseconds) of a group */ +/* Return total CPU usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, enum cpuacct_stat_index index) { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d..5031645 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,11 +10,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ - -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include "cpudeadline.h" +#include "sched.h" static inline int parent(int i) { @@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) return; /* adapted from lib/prio_heap.c */ - while(1) { + while (1) { u64 largest_dl; + l = left_child(idx); r = right_child(idx); largest = idx; @@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, return 1; } else { int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && @@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } /* - * cpudl_clear - remove a cpu from the cpudl max-heap + * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu + * @cpu: the target CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) /* * cpudl_set - update the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu + * @cpu: the target CPU + * @dl: the new earliest deadline for this CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; @@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) /* * cpudl_set_freecpu - Set the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_set_freecpu(struct cpudl *cp, int cpu) { @@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) /* * cpudl_clear_freecpu - Clear the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_clear_freecpu(struct cpudl *cp, int cpu) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26..0adeda9 100644 --- 
a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,35 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUDL_H -#define _LINUX_CPUDL_H -#include <linux/sched.h> -#include <linux/sched/deadline.h> - -#define IDX_INVALID -1 +#define IDX_INVALID -1 struct cpudl_item { - u64 dl; - int cpu; - int idx; + u64 dl; + int cpu; + int idx; }; struct cpudl { - raw_spinlock_t lock; - int size; - cpumask_var_t free_cpus; - struct cpudl_item *elements; + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; }; - #ifdef CONFIG_SMP -int cpudl_find(struct cpudl *cp, struct task_struct *p, - struct cpumask *later_mask); +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); -int cpudl_init(struct cpudl *cp); +int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ - -#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc5144..5e54cbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #include "sched.h" DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 617c674..d2c6083 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -11,61 +11,56 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cpufreq.h> -#include <linux/kthread.h> -#include <uapi/linux/sched/types.h> -#include <linux/slab.h> -#include <trace/events/power.h> - #include "sched.h" +#include <trace/events/power.h> + struct sugov_tunables { - struct gov_attr_set attr_set; - unsigned int rate_limit_us; + struct gov_attr_set attr_set; + unsigned int rate_limit_us; }; struct sugov_policy { - struct cpufreq_policy *policy; - - struct sugov_tunables *tunables; - struct list_head tunables_hook; - - raw_spinlock_t update_lock; /* For shared policies */ - u64 last_freq_update_time; - s64 freq_update_delay_ns; - unsigned int next_freq; - unsigned int cached_raw_freq; - - /* The next fields are only needed if fast switch cannot be used. 
*/ - struct irq_work irq_work; - struct kthread_work work; - struct mutex work_lock; - struct kthread_worker worker; - struct task_struct *thread; - bool work_in_progress; - - bool need_freq_update; + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + unsigned int cached_raw_freq; + + /* The next fields are only needed if fast switch cannot be used: */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + bool work_in_progress; + + bool need_freq_update; }; struct sugov_cpu { - struct update_util_data update_util; - struct sugov_policy *sg_policy; - unsigned int cpu; + struct update_util_data update_util; + struct sugov_policy *sg_policy; + unsigned int cpu; - bool iowait_boost_pending; - unsigned int iowait_boost; - unsigned int iowait_boost_max; + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy. */ - unsigned long util_cfs; - unsigned long util_dl; - unsigned long max; - unsigned int flags; + /* The fields below are only needed when sharing a policy: */ + unsigned long util_cfs; + unsigned long util_dl; + unsigned long max; - /* The field below is for single-CPU policies only. */ + /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; + unsigned long saved_idle_calls; #endif }; @@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) /* * Since cpufreq_update_util() is called with rq->lock held for - * the @target_cpu, our per-cpu data is fully serialized. + * the @target_cpu, our per-CPU data is fully serialized. * - * However, drivers cannot in general deal with cross-cpu + * However, drivers cannot in general deal with cross-CPU * requests, so while get_next_freq() will work, our * sugov_update_commit() call may not for the fast switching platforms. * @@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) } delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; } @@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util; + + if (rq->rt.rt_nr_running) { + util = sg_cpu->max; + } else { + util = sg_cpu->util_dl; + if (rq->cfs.h_nr_running) + util += sg_cpu->util_cfs; + } + /* * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. 
*/ - return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); + return min(util, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { - if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ +/* + * Make sugov_should_update_freq() ignore the rate limit when DL + * has increased the utilization. + */ +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +{ + if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + sg_policy->need_freq_update = true; +} + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (!sugov_should_update_freq(sg_policy, time)) return; busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT) { - next_f = policy->cpuinfo.max_freq; - } else { - sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_policy, util, max); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq) { - next_f = sg_policy->next_freq; + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
+ */ + if (busy && next_f < sg_policy->next_freq) { + next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; - } + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; } + sugov_update_commit(sg_policy, time, next_f); } @@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) unsigned long j_util, j_max; s64 delta_ns; + sugov_get_util(j_sg_cpu); + /* * If the CFS CPU utilization was last updated before the * previous frequency update and the time elapsed between the @@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - j_sg_cpu->util_cfs = 0; - if (j_sg_cpu->util_dl == 0) - continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) - return policy->cpuinfo.max_freq; j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); + sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); if (j_util * max > j_max * util) { util = j_util; max = j_max; } - - sugov_iowait_boost(j_sg_cpu, &util, &max); } return get_next_freq(sg_policy, util, max); } -static void sugov_update_shared(struct update_util_data *hook, u64 time, - unsigned int flags) +static void +sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_lock(&sg_policy->update_lock); - sugov_get_util(sg_cpu); - sg_cpu->flags = flags; - - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT) - next_f = sg_policy->policy->cpuinfo.max_freq; - else - next_f = sugov_next_freq_shared(sg_cpu, time); + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } @@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) return sprintf(buf, "%u\n", tunables->rate_limit_us); } -static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, - size_t count) +static ssize_t +rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; struct sched_attr attr = { - .size = sizeof(struct sched_attr), - .sched_policy = SCHED_DEADLINE, - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, /* * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. 
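As an illustration of the sugov_aggregate_util() / sugov_update_single() rework above, here is a minimal userspace sketch of the same aggregation; struct rq_snapshot and its fields are made-up stand-ins for the rq/sg_cpu state the governor reads, and only the aggregation and clamping logic mirrors the hunks: go to full capacity while RT tasks are runnable, otherwise add CFS utilization on top of DL utilization when CFS tasks are queued, then clamp with min(util, max).

#include <stdio.h>

/* Hypothetical snapshot of the few rq/sg_cpu fields the aggregation reads. */
struct rq_snapshot {
        unsigned long rt_nr_running;    /* rq->rt.rt_nr_running   */
        unsigned long cfs_h_nr_running; /* rq->cfs.h_nr_running   */
        unsigned long util_cfs;         /* sg_cpu->util_cfs       */
        unsigned long util_dl;          /* sg_cpu->util_dl        */
        unsigned long max;              /* sg_cpu->max (capacity) */
};

/* Mirrors the reworked sugov_aggregate_util() shown in the hunk above. */
static unsigned long aggregate_util(const struct rq_snapshot *s)
{
        unsigned long util;

        if (s->rt_nr_running) {
                util = s->max;                  /* RT runnable: ask for max */
        } else {
                util = s->util_dl;              /* DL is always accounted   */
                if (s->cfs_h_nr_running)
                        util += s->util_cfs;    /* CFS only when queued     */
        }
        return util < s->max ? util : s->max;   /* min(util, max)           */
}

int main(void)
{
        struct rq_snapshot s = { 0, 2, 300, 200, 1024 };

        printf("cfs+dl: %lu\n", aggregate_util(&s));    /* 500  */
        s.rt_nr_running = 1;
        printf("rt:     %lu\n", aggregate_util(&s));    /* 1024 */
        return 0;
}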
@@ -662,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; - sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; - sg_policy->work_in_progress = false; - sg_policy->need_freq_update = false; - sg_policy->cached_raw_freq = 0; + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { @@ -720,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy) } static struct cpufreq_governor schedutil_gov = { - .name = "schedutil", - .owner = THIS_MODULE, - .dynamic_switching = true, - .init = sugov_init, - .exit = sugov_exit, - .start = sugov_start, - .stop = sugov_stop, - .limits = sugov_limits, + .name = "schedutil", + .owner = THIS_MODULE, + .dynamic_switching = true, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba..daaadf9 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -14,7 +14,7 @@ * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with - * a 2 dimensional bitmap (the first for priority class, the second for cpus + * a 2 dimensional bitmap (the first for priority class, the second for CPUs * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a @@ -26,12 +26,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ - -#include <linux/gfp.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/slab.h> -#include "cpupri.h" +#include "sched.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) @@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, } /** - * cpupri_set - update the cpu priority setting + * cpupri_set - update the CPU priority setting * @cp: The cpupri context - * @cpu: The target cpu + * @cpu: The target CPU * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked @@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) return; /* - * If the cpu was currently mapped to a different value, we + * If the CPU was currently mapped to a different value, we * need to map it to the new value then remove the old value. * Note, we must add the new value first, otherwise we risk the * cpu being missed by the priority loop in cpupri_find. 
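The cpupri comment above describes a two-level structure: one CPU mask per priority class, scanned from the lowest class upwards. A minimal sketch of that lookup, assuming a fixed 64-CPU system and plain uint64_t masks instead of cpumask_var_t (mini_cpupri and mini_cpupri_find are invented names for the example, not the kernel API):

#include <stdint.h>
#include <stdio.h>

#define NR_PRI  102     /* CPUPRI_NR_PRIORITIES with MAX_RT_PRIO == 100 */

/* Hypothetical miniature of struct cpupri: one CPU mask per priority class. */
struct mini_cpupri {
        uint64_t pri_to_cpu[NR_PRI];    /* bit i set: CPU i runs at this class */
};

/*
 * Sketch of the cpupri_find() idea: scan priority classes from lowest to
 * highest and stop at the first class holding a CPU that is also in the
 * task's affinity mask.  Returns 1 and fills *lowest_mask on success.
 */
static int mini_cpupri_find(const struct mini_cpupri *cp, int task_pri,
                            uint64_t affinity, uint64_t *lowest_mask)
{
        for (int idx = 0; idx < task_pri; idx++) {
                uint64_t mask = cp->pri_to_cpu[idx] & affinity;

                if (mask) {
                        *lowest_mask = mask;
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        struct mini_cpupri cp = { { 0 } };
        uint64_t lowest;

        cp.pri_to_cpu[1]  = 0x0c;       /* CPUs 2,3 run CPUPRI_NORMAL work   */
        cp.pri_to_cpu[50] = 0x03;       /* CPUs 0,1 run mid-priority RT work */

        if (mini_cpupri_find(&cp, 60, 0x0f, &lowest))
                printf("push to a CPU in mask 0x%llx\n",
                       (unsigned long long)lowest);     /* 0xc */
        return 0;
}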
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab0500..7dc20a3 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,32 +1,25 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUPRI_H -#define _LINUX_CPUPRI_H - -#include <linux/sched.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - atomic_t count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { - struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int *cpu_to_pri; + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #endif - -#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9..0796f93 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,10 +1,6 @@ -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/tsacct_kern.h> -#include <linux/kernel_stat.h> -#include <linux/static_key.h> -#include <linux/context_tracking.h> -#include <linux/sched/cputime.h> +/* + * Simple CPU accounting cgroup controller + */ #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, } /* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update + * Account user CPU time to a process. + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in user space since the last update */ void account_user_time(struct task_struct *p, u64 cputime) { @@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) } /* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update + * Account guest CPU time to a process. + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in virtual machine since the last update */ void account_guest_time(struct task_struct *p, u64 cputime) { @@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) } /* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update + * Account system CPU time to a process and desired cpustat field + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ void account_system_index_time(struct task_struct *p, @@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, } /* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to + * Account system CPU time to a process. 
+ * @p: the process that the CPU time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * @cputime: the CPU time spent in kernel space since the last update */ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { @@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) /* * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait + * @cputime: the CPU time spent in involuntary wait */ void account_steal_time(u64 cputime) { @@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) /* * Account for idle time. - * @cputime: the cpu time spent in idle wait + * @cputime: the CPU time spent in idle wait */ void account_idle_time(u64 cputime) { @@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to + * @p: the process that the CPU time gets accounted to * @user_tick: is the tick from userspace * @rq: the pointer to rq * @@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) {} + struct rq *rq, int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +# ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_common_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) @@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) vtime_flush(prev); arch_vtime_task_switch(prev); } -#endif - +# endif #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) *ut = cputime.utime; *st = cputime.stime; } -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ + /* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to + * Account a single tick of CPU time. + * @p: the process that the CPU time gets accounted to * @user_tick: indicates if the tick is a user or a system tick */ void account_process_tick(struct task_struct *p, int user_tick) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df0978..d1c7bf7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,9 +17,6 @@ */ #include "sched.h" -#include <linux/slab.h> -#include <uapi/linux/sched/types.h> - struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline @@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline @@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { if (!has_pushable_dl_tasks(rq)) return; @@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); } @@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * If we cannot preempt any rq, fall back to pick any - * online cpu. + * online CPU: */ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); if (cpu >= nr_cpu_ids) { /* - * Fail to find any suitable cpu. + * Failed to find any suitable CPU. * The task will never come back! */ BUG_ON(dl_bandwidth_enabled()); @@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) { } -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { } #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, - int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); /* * We are being explicitly informed that a new instance is starting, @@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); return p; } @@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) enqueue_pushable_dl_task(rq, p); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); @@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) /* * We have to consider system topology and task affinity - * first, then we can look for a suitable cpu. + * first, then we can look for a suitable CPU. */ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; @@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) * Now we check how well this matches with task's * affinity and system topology. 
* - * The last cpu where the task run is our first + * The last CPU where the task run is our first * guess, since it is most likely cache-hot there. */ if (cpumask_test_cpu(cpu, later_mask)) @@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) best_cpu = cpumask_first_and(later_mask, sched_domain_span(sd)); /* - * Last chance: if a cpu being in both later_mask + * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our - * choice. Of course, the latest possible cpu is + * choice. Of course, the latest possible CPU is * already under consideration through later_mask. */ if (best_cpu < nr_cpu_ids) { @@ -2067,7 +2071,7 @@ retry: if (task == next_task) { /* * The task is still there. We don't try - * again, some other cpu will pull it when ready. + * again, some other CPU will pull it when ready. */ goto out; } @@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one - * from an overloaded cpu, if any. + * from an overloaded CPU, if any. */ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; - queue_pull_task(rq); + deadline_queue_pull_task(rq); } /* @@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, * or lowering its prio, so... */ if (!rq->dl.overloaded) - queue_pull_task(rq); + deadline_queue_pull_task(rq); /* * If we now have a earlier deadline task than p, @@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) { struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - dl_se->dl_density = 0; + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - dl_se->dl_non_contending = 0; - dl_se->dl_overrun = 0; + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); + unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; int cpus, ret; unsigned long flags; + dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(dest_cpu); overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) + if (overflow) { ret = -EBUSY; - else { + } else { /* * We reserve space for this task in the destination * root_domain, as we can't fail after this point. 
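The dl_task_can_attach() / dl_cpu_busy() hunks above both hinge on one admission test: does the root domain still have room for the requested deadline bandwidth? Below is a hedged userspace rendering of that check (the real helper is __dl_overflow() in kernel/sched/sched.h; BW_SHIFT and the field layout here are assumptions made for illustration):

#include <stdbool.h>
#include <stdio.h>

#define BW_SHIFT        20              /* assumed fixed point: 1 CPU == 1 << 20 */
#define BW_UNIT         (1UL << BW_SHIFT)

/* Hypothetical miniature of struct dl_bw: per-CPU limit plus admitted total. */
struct mini_dl_bw {
        long bw;                        /* per-CPU limit, -1 == no limit */
        unsigned long total_bw;         /* bandwidth already admitted    */
};

/*
 * Admission test: swapping old_bw for new_bw must keep the admitted total
 * within cpus * per-CPU limit, otherwise the caller returns -EBUSY.
 */
static bool dl_overflow(const struct mini_dl_bw *b, int cpus,
                        unsigned long old_bw, unsigned long new_bw)
{
        if (b->bw == -1)
                return false;           /* bandwidth control disabled */
        return (unsigned long)b->bw * cpus < b->total_bw - old_bw + new_bw;
}

int main(void)
{
        /* 95% of each CPU may be reserved; 1.5 CPUs worth already admitted. */
        struct mini_dl_bw b = { .bw = 95 * BW_UNIT / 100,
                                .total_bw = 3 * BW_UNIT / 2 };

        /* Admit a task needing 30% of a CPU on a 2-CPU root domain. */
        printf("%s\n", dl_overflow(&b, 2, 0, 3 * BW_UNIT / 10) ?
               "-EBUSY" : "admitted");  /* 1.8 CPUs <= 1.9 CPUs: admitted */
        return 0;
}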
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return overflow; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 72c401b..15b10e2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,7 +1,7 @@ /* * kernel/sched/debug.c * - * Print the CFS rbtree + * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * @@ -9,16 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - -#include <linux/proc_fs.h> -#include <linux/sched/mm.h> -#include <linux/sched/task.h> -#include <linux/seq_file.h> -#include <linux/kallsyms.h> -#include <linux/utsname.h> -#include <linux/mempolicy.h> -#include <linux/debugfs.h> - #include "sched.h" static DEFINE_SPINLOCK(sched_debug_lock); @@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); + set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, 
proc_dointvec_minmax, true ); + set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[13] is terminator */ return table; @@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static cpumask_var_t sd_sysctl_cpus; +static struct ctl_table_header *sd_sysctl_header; void register_sched_domain_sysctl(void) { @@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group { struct sched_entity *se = tg->se[cpu]; -#define P(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) -#define PN(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); + if (schedstat_enabled()) { PN_SCHEDSTAT(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.sleep_start); @@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN_SCHEDSTAT(se->statistics.wait_sum); P_SCHEDSTAT(se->statistics.wait_count); } + P(se->load.weight); P(se->runnable_weight); #ifdef CONFIG_SMP @@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) return group_path; cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif @@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", + cfs_rq->avg.util_est.enqueued); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void) /* * This itererator needs some explanation. * It returns 1 for the header position. - * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * This means 2 is CPU 0. 
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. */ static void *sched_debug_start(struct seq_file *file, loff_t *offset) { @@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } @@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) } static const struct seq_operations sched_debug_sops = { - .start = sched_debug_start, - .next = sched_debug_next, - .stop = sched_debug_stop, - .show = sched_debug_show, + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, }; static int sched_debug_release(struct inode *inode, struct file *file) @@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) #ifdef CONFIG_NUMA_BALANCING @@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); + P(se.avg.util_est.ewma); + P(se.avg.util_est.enqueued); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc..0951d1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,25 +20,10 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ - -#include <linux/sched/mm.h> -#include <linux/sched/topology.h> - -#include <linux/latencytop.h> -#include <linux/cpumask.h> -#include <linux/cpuidle.h> -#include <linux/slab.h> -#include <linux/profile.h> -#include <linux/interrupt.h> -#include <linux/mempolicy.h> -#include <linux/migrate.h> -#include <linux/task_work.h> -#include <linux/sched/isolation.h> +#include "sched.h" #include <trace/events/sched.h> -#include "sched.h" - /* * Targeted preemption latency for CPU-bound tasks: * @@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; #ifdef CONFIG_SMP /* - * For asym packing, by default the lower numbered cpu has higher priority. + * For asym packing, by default the lower numbered CPU has higher priority. */ int __weak arch_asym_cpu_priority(int cpu) { @@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) } /* - * The averaged statistics, shared & private, memory & cpu, + * The averaged statistics, shared & private, memory & CPU, * occupy the first half of the array. The second half of the * array is for current counters, which are averaged into the * first set by task_numa_placement. 
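The /proc/sched_debug iterator comment in the debug.c hunk just above ("1 is the header position, 2 is CPU 0, CPUs may be missing") maps seq_file offsets onto present CPUs. A small sketch of that mapping, with a plain bitmask standing in for the cpumask and the function name invented for the example:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 64

/*
 * Offset 1 is the header; offset n >= 2 is the (n - 2)-th *present* CPU,
 * skipping holes left by CPU hotplug.  Returns the CPU number, -1 for the
 * header and -2 once the offset runs past the last present CPU.
 */
static int debug_pos_to_cpu(uint64_t present_mask, unsigned long pos)
{
        unsigned long seen = 0;

        if (pos == 1)
                return -1;                      /* header line */
        if (pos < 2)
                return -2;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!(present_mask & (1ULL << cpu)))
                        continue;               /* hotplugged-out CPU */
                if (seen++ == pos - 2)
                        return cpu;
        }
        return -2;                              /* end of sequence */
}

int main(void)
{
        uint64_t mask = 0x2d;   /* CPUs 0, 2, 3, 5 present; 1 and 4 missing */

        for (unsigned long pos = 1; pos <= 6; pos++)
                printf("pos %lu -> %d\n", pos, debug_pos_to_cpu(mask, pos));
        /* prints -1, 0, 2, 3, 5, -2 */
        return 0;
}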
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. */ if (cur) { - /* Skip this swap candidate if cannot move to the source cpu */ + /* Skip this swap candidate if cannot move to the source CPU: */ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; @@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, goto balance; } - /* Balance doesn't matter much if we're running a task per cpu */ + /* Balance doesn't matter much if we're running a task per CPU: */ if (imp > env->best_imp && src_rq->nr_running == 1 && dst_rq->nr_running == 1) goto assign; @@ -1676,7 +1661,7 @@ balance: */ if (!cur) { /* - * select_idle_siblings() uses an per-cpu cpumask that + * select_idle_siblings() uses an per-CPU cpumask that * can be used from IRQ context. */ local_irq_disable(); @@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; + unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - p->numa_migrate_retry = jiffies + interval; + numa_migrate_retry = jiffies + interval; + + /* + * Check that the new retry threshold is after the current one. If + * the retry is in the future, it implies that wake_affine has + * temporarily asked NUMA balancing to backoff from placement. + */ + if (numa_migrate_retry > p->numa_migrate_retry) + return; + + /* Safe to try placing the task on the preferred node */ + p->numa_migrate_retry = numa_migrate_retry; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) } #ifdef CONFIG_FAIR_GROUP_SCHED -# ifdef CONFIG_SMP +#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. @@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) return clamp_t(long, runnable, MIN_SHARES, shares); } -# endif /* CONFIG_SMP */ +#endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq) { + if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq, 0); + cpufreq_update_util(rq, flags); } } @@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna } /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. 
+ * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +/* * sched_entity: * * task: @@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); return 1; } @@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } /* - * Called within set_task_rq() right before setting a task's cpu. The + * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, * including the state of rq->lock, should be made. */ @@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with cpu capacity wehreas the runnable sum + * As running sum is scale with CPU capacity wehreas the runnable sum * is not we rescale running_sum 1st */ running_sum = se->avg.util_sum / @@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #endif if (decayed) - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); return decayed; } @@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, flags); } /** @@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } /* @@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (!se->avg.last_update_time && (flags & DO_ATTACH)) { - attach_entity_load_avg(cfs_rq, se); + /* + * DO_ATTACH means we're here from enqueue_entity(). + * !last_update_time means we've passed through + * migrate_task_rq_fair() indicating we migrated. + * + * IOW we're enqueueing a task on a new CPU. 
+ */ + attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); } else if (decayed && (flags & UPDATE_TG)) @@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, ue.enqueued); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); +} + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + maring < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +{ + long last_ewma_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Update root cfs_rq's estimated utilization + * + * If *p is the last task then the root cfs_rq's estimated utilization + * of a CPU is 0 by definition. + */ + ue.enqueued = 0; + if (cfs_rq->nr_running) { + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + + /* + * Skip update of task's estimated utilization when its EWMA is + * already ~1% close to its last activation value. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + last_ewma_diff = ue.enqueued - ue.ewma; + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. 
This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + WRITE_ONCE(p->se.avg.util_est, ue); +} + #else /* CONFIG_SMP */ static inline int @@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} + #endif /* CONFIG_SMP */ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) add_nr_running(rq, task_delta); - /* determine whether we need to wake up potentially idle cpu */ + /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); } @@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) } /* - * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * Both these CPU hotplug callbacks race against unregister_fair_sched_group() * * The race is harmless, since modifying bandwidth settings of unhooked group * bits doesn't do much. @@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ cfs_rq->runtime_remaining = 1; /* - * Offline rq is schedulable till cpu is completely disabled + * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. 
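Two of the util_est hunks a little earlier in this fair.c diff reward a worked example: the branchless range check in within_margin() and the shift-based EWMA at the end of util_est_dequeue(). The sketch below is a plain userspace rendering under the assumptions stated in the diff's own comments (new-sample weight w = 1/4, i.e. UTIL_EST_WEIGHT_SHIFT == 2, and a ~1% skip margin); the UTIL_AVG_UNCHANGED flag handling is left out:

#include <stdbool.h>
#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT   2       /* w = 1/4, per the comment in the diff */
#define SCHED_CAPACITY_SCALE    1024

/* Same branchless "abs(value) < margin" check as within_margin() above. */
static bool within_margin(int value, int margin)
{
        return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

/*
 * Fixed-point EWMA step from util_est_dequeue():
 *   ewma(t) = w * sample + (1 - w) * ewma(t-1), with w = 1/4,
 * skipped when the new sample is within ~1% of the previous EWMA.
 */
static unsigned long ewma_update(unsigned long ewma, unsigned long sample)
{
        long last_ewma_diff = (long)sample - (long)ewma;

        if (within_margin(last_ewma_diff, SCHED_CAPACITY_SCALE / 100))
                return ewma;                    /* close enough: keep it  */

        ewma <<= UTIL_EST_WEIGHT_SHIFT;         /* 4 * ewma(t-1)          */
        ewma += last_ewma_diff;                 /* + (sample - ewma(t-1)) */
        ewma >>= UTIL_EST_WEIGHT_SHIFT;         /* / 4                    */
        return ewma;
}

int main(void)
{
        printf("%lu\n", ewma_update(400, 800)); /* (3*400 + 800) / 4 = 500 */
        printf("%lu\n", ewma_update(400, 404)); /* within ~1%: stays 400   */
        printf("%d %d\n", within_margin(-5, 10), within_margin(12, 10)); /* 1 0 */
        return 0;
}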
*/ cfs_rq->runtime_enabled = 0; @@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); + util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); * * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when cpu may be busy, then we have: + * If a CPU misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when CPU may be busy, then we have: * * load_n = (1 - 1/2^i)^n * load_0 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load @@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } + +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + int has_blocked; /* Idle CPUS has blocked load */ + unsigned long next_balance; /* in jiffy units */ + unsigned long next_blocked; /* Next update of blocked load in jiffies */ +} nohz ____cacheline_aligned; + #endif /* CONFIG_NO_HZ_COMMON */ /** @@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. * * Therefore we need to avoid the delta approach from the regular tick when @@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq) } /* - * Return a low guess at the load of a migration-source cpu weighted + * Return a low guess at the load of a migration-source CPU weighted * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to @@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type) } /* - * Return a high guess at the load of a migration-target cpu weighted + * Return a high guess at the load of a migration-target CPU weighted * according to the scheduling class and "nice" value. */ static unsigned long target_load(int cpu, int type) @@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long task_load; this_eff_load = target_load(this_cpu, sd->wake_idx); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); if (sync) { unsigned long current_load = task_h_load(current); @@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; + /* + * If sync, adjust the weight of prev_eff_load such that if + * prev_eff == this_eff that select_idle_sibling() will consider + * stacking the wakee on top of the waker if no other CPU is + * idle. + */ + if (sync) + prev_eff_load += 1; + + return this_eff_load < prev_eff_load ? 
this_cpu : nr_cpumask_bits; +} + +#ifdef CONFIG_NUMA_BALANCING +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ + unsigned long interval; + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + /* If balancing has no preference then continue gathering data */ + if (p->numa_preferred_nid == -1) + return; + + /* + * If the wakeup is not affecting locality then it is neutral from + * the perspective of NUMA balacing so continue gathering data. + */ + if (cpu_to_node(prev_cpu) == cpu_to_node(target)) + return; + + /* + * Temporarily prevent NUMA balancing trying to place waker/wakee after + * wakee has been moved by wake_affine. This will potentially allow + * related tasks to converge and update their data placement. The + * 4 * numa_scan_period is to allow the two-pass filter to migrate + * hot data to the wakers node. + */ + interval = max(sysctl_numa_balancing_scan_delay, + p->numa_scan_period << 2); + p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); + + interval = max(sysctl_numa_balancing_scan_delay, + current->numa_scan_period << 2); + current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); } +#else +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ +} +#endif static int wake_affine(struct sched_domain *sd, struct task_struct *p, - int prev_cpu, int sync) + int this_cpu, int prev_cpu, int sync) { - int this_cpu = smp_processor_id(); int target = nr_cpumask_bits; if (sched_feat(WA_IDLE)) @@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; + update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; } -static inline unsigned long task_util(struct task_struct *p); static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) @@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward cpus of our domain */ + /* Bias balancing toward CPUs of our domain */ if (local_group) load = source_load(i, load_idx); else @@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (min_runnable_load > (runnable_load + imbalance)) { /* * The runnable load is significantly smaller - * so we can pick this new cpu + * so we can pick this new CPU: */ min_runnable_load = runnable_load; min_avg_load = avg_load; @@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, (100*min_avg_load > imbalance_scale*avg_load)) { /* * The runnable loads are close so take the - * blocked load into account through avg_load. + * blocked load into account through avg_load: */ min_avg_load = avg_load; idlest = group; @@ -5903,6 +6116,18 @@ skip_spare: if (!idlest) return NULL; + /* + * When comparing groups across NUMA domains, it's possible for the + * local domain to be very lightly loaded relative to the remote + * domains but "imbalance" skews the comparison making remote CPUs + * look much more favourable. When considering cross-domain, add + * imbalance to the runnable load on the remote node and consider + * staying local. 
+ */ + if ((sd->flags & SD_NUMA) && + min_runnable_load + imbalance >= this_runnable_load) + return NULL; + if (min_runnable_load > (this_runnable_load + imbalance)) return NULL; @@ -5914,7 +6139,7 @@ skip_spare: } /* - * find_idlest_group_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. */ static int find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p new_cpu = find_idlest_group_cpu(group, p, cpu); if (new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ + /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; continue; } - /* Now try balancing at a lower domain level of new_cpu */ + /* Now try balancing at a lower domain level of 'new_cpu': */ cpu = new_cpu; weight = sd->span_weight; sd = NULL; @@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (tmp->flags & sd_flag) sd = tmp; } - /* while loop will break here if sd == NULL */ } return new_cpu; @@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; /* - * If the previous cpu is cache affine and idle, don't be stupid. + * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; - /* Check a recently used CPU as a potential idle candidate */ + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && @@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { /* * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake. + * candidate for the next wake: */ p->recent_used_cpu = prev; return recent_used_cpu; @@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). +/** + * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * @cpu: the CPU to get the utilization of + * + * The unit of the return value must be the one of capacity so we can compare + * the utilization with the capacity of the CPU that is available for CFS task + * (ie cpu_capacity). * * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the * recent utilization of currently non-runnable tasks on a CPU. It represents @@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * current capacity (capacity_curr <= capacity_orig) of the CPU because it is * the running time on this CPU scaled by capacity_curr. * + * The estimated utilization of a CPU is defined to be the maximum between its + * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks + * currently RUNNABLE on that CPU. + * This allows to properly represent the expected utilization of a CPU which + * has just got a big task running since a long sleep period. 
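A minimal userspace sketch of that definition, with arbitrary numbers (the names mirror the kernel's fields but the function is purely illustrative):

#include <stdio.h>

/*
 * Userspace illustration of the "estimated utilization" idea described
 * above: a CPU's utilization is the maximum of its decayed util_avg and
 * the sum of the estimated utilization of the tasks currently enqueued,
 * clamped to the CPU's original capacity. Values are arbitrary.
 */
static unsigned long cpu_util_sketch(unsigned long util_avg,
				     unsigned long util_est_enqueued,
				     unsigned long capacity_orig)
{
	unsigned long util = util_avg;

	if (util_est_enqueued > util)
		util = util_est_enqueued;

	return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
	/*
	 * A big task woke up after a long sleep: util_avg has decayed to 60,
	 * but its estimated utilization at enqueue time is 400.
	 */
	printf("util = %lu\n", cpu_util_sketch(60, 400, 1024));	/* -> 400 */
	return 0;
}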
At the same time + * however it preserves the benefits of the "blocked utilization" in + * describing the potential for other tasks waking up on the same CPU. + * * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even * higher than capacity_orig because of unfortunate rounding in * cfs.avg.util_avg or just after migrating tasks and new task wakeups until @@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * available capacity. We allow utilization to overshoot capacity_curr (but not * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). + * + * Return: the (estimated) utilization for the specified CPU */ -static unsigned long cpu_util(int cpu) +static inline unsigned long cpu_util(int cpu) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + struct cfs_rq *cfs_rq; + unsigned int util; - return (util >= capacity) ? capacity : util; -} + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); -static inline unsigned long task_util(struct task_struct *p) -{ - return p->se.avg.util_avg; + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* - * cpu_util_wake: Compute cpu utilization with any contributions from + * cpu_util_wake: Compute CPU utilization with any contributions from * the waking task p removed. */ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { - unsigned long util, capacity; + struct cfs_rq *cfs_rq; + unsigned int util; /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); - return (util >= capacity) ? capacity : util; + /* Discount task's blocked util from CPU's util */ + util -= min_t(unsigned int, util, task_util(p)); + + /* + * Covered cases: + * + * a) if *p is the only task sleeping on this CPU, then: + * cpu_util (== task_util) > util_est (== 0) + * and thus we return: + * cpu_util_wake = (cpu_util - task_util) = 0 + * + * b) if other tasks are SLEEPING on this CPU, which is now exiting + * IDLE, then: + * cpu_util >= task_util + * cpu_util > util_est (== 0) + * and thus we discount *p's blocked utilization to return: + * cpu_util_wake = (cpu_util - task_util) >= 0 + * + * c) if other tasks are RUNNABLE on that CPU and + * util_est > cpu_util + * then we use util_est since it returns a more restrictive + * estimation of the spare capacity on that CPU, by just + * considering the expected utilization of tasks already + * runnable on that CPU. + * + * Cases a) and b) are covered by the above code, while case c) is + * covered by the following code when estimated utilization is + * enabled. + */ + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + /* + * Utilization (estimated) can exceed the CPU capacity, thus let's + * clamp to the maximum CPU capacity to ensure consistency with + * the cpu_util call. + */ + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* @@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) * that have the 'sd_flag' flag set. 
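Picking up the cpu_util_wake() cases spelled out above, a standalone illustration (arbitrary values, not the kernel implementation):

#include <stdio.h>

/*
 * Sketch of the cpu_util_wake() cases discussed above: the waking task's
 * blocked contribution is removed from util_avg, then util_est (which does
 * not include sleeping tasks) can raise the result back up when other tasks
 * are still runnable on that CPU. Illustrative only.
 */
static unsigned long cpu_util_wake_sketch(unsigned long util_avg,
					  unsigned long util_est_enqueued,
					  unsigned long task_util,
					  unsigned long capacity_orig)
{
	unsigned long util = util_avg;

	/* Discount the waking task's blocked utilization (cases a and b) */
	util -= task_util < util ? task_util : util;

	/* Case c: the runnable tasks' estimate is more restrictive */
	if (util_est_enqueued > util)
		util = util_est_enqueued;

	return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
	/* a) p is the only (sleeping) task: 150 - 150 -> 0 */
	printf("%lu\n", cpu_util_wake_sketch(150, 0, 150, 1024));
	/* c) other tasks runnable with util_est 300: result is 300 */
	printf("%lu\n", cpu_util_wake_sketch(150, 300, 150, 1024));
	return 0;
}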
In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balances load by selecting the idlest cpu in the idlest group, or under - * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. + * Balances load by selecting the idlest CPU in the idlest group, or under + * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * - * Returns the target cpu number. + * Returns the target CPU number. * * preempt must be disabled. */ @@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = wake_flags & WF_SYNC; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; /* - * If both cpu and prev_cpu are part of this domain, + * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && @@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); + new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { @@ -6407,9 +6682,9 @@ pick_cpu: static void detach_entity_cfs_rq(struct sched_entity *se); /* - * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. + * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -6738,7 +7013,7 @@ simple: p = task_of(se); -done: __maybe_unused +done: __maybe_unused; #ifdef CONFIG_SMP /* * Move the next running task to the front of @@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * BASICS * * The purpose of load-balancing is to achieve the same basic fairness the - * per-cpu scheduler provides, namely provide a proportional amount of compute + * per-CPU scheduler provides, namely provide a proportional amount of compute * time to each task. This is expressed in the following equation: * * W_i,n/P_i == W_j,n/P_j for all i,j (1) * - * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight * W_i,0 is defined as: * * W_i,0 = \Sum_j w_i,j (2) * - * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous @@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * C_i is the compute capacity of cpu i, typically it is the + * C_i is the compute capacity of CPU i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. 
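A tiny numeric illustration of the weight average in (3); this is a sketch with an assumed decay shift of n = 3, not the kernel's PELT code:

#include <stdio.h>

/*
 * Numeric illustration of the weight average in (3): the running average
 * keeps (2^n - 1)/2^n of its old value and blends in 1/2^n of the
 * instantaneous weight W_i,0. Here n = 3, i.e. 1/8 per step.
 */
static unsigned long avg_step(unsigned long w_avg, unsigned long w_inst,
			      unsigned int n)
{
	return ((((1UL << n) - 1) * w_avg) + w_inst) >> n;
}

int main(void)
{
	unsigned long w_avg = 0;
	int step;

	/* The instantaneous weight jumps to 1024; the average converges. */
	for (step = 0; step < 10; step++) {
		w_avg = avg_step(w_avg, 1024, 3);
		printf("step %d: W' = %lu\n", step, w_avg);
	}
	return 0;
}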
* @@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * SCHED DOMAINS * * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) - * for all i,j solution, we create a tree of cpus that follows the hardware + * for all i,j solution, we create a tree of CPUs that follows the hardware * topology where each level pairs two lower groups (or better). This results - * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of cpus in + * of load-balance at each level inv. proportional to the number of CPUs in * the groups. * * This yields: @@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * \Sum { --- * --- * 2^i } = O(n) (5) * i = 0 2^i 2^i * `- size of each group - * | | `- number of cpus doing load-balance + * | | `- number of CPUs doing load-balance * | `- freq * `- sum over all levels * @@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * this makes (5) the runtime complexity of the balancer. * * An important property here is that each CPU is still (indirectly) connected - * to every other cpu in at most O(log n) steps: + * to every other CPU in at most O(log n) steps: * * The adjacency matrix of the resulting graph is given by: * @@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * A^(log_2 n)_i,j != 0 for all i,j (7) * - * Showing there's indeed a path between every cpu in at most O(log n) steps. + * Showing there's indeed a path between every CPU in at most O(log n) steps. * The task movement gives a factor of O(m), giving a convergence complexity * of: * @@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * WORK CONSERVING * * In order to avoid CPUs going idle while there's still work to do, new idle - * balancing is more aggressive and has the newly idle cpu iterate up the domain + * balancing is more aggressive and has the newly idle CPU iterate up the domain * tree itself instead of relying on other CPUs to bring it work. * * This adds some complexity to both (5) and (8) but it reduces the total idle @@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) * - * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. * * The big problem is S_k, its a global sum needed to compute a local (W_i) * property. @@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_NOHZ_STATS 0x10 +#define LBF_NOHZ_AGAIN 0x20 struct lb_env { struct sched_domain *sd; @@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags |= LBF_SOME_PINNED; /* - * Remember if this task can be migrated to any other cpu in + * Remember if this task can be migrated to any other CPU in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. 
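As a quick numeric sanity check of the per-tick balancing cost claimed in (5) above, a standalone program (the CPU count is an assumption):

#include <stdio.h>

/*
 * Quick numeric check of the O(n) claim in (5): summing
 * (1/2^i) * (n/2^i) * 2^i = n/2^i over all domain levels is a geometric
 * series bounded by 2n, hence linear in the number of CPUs.
 */
int main(void)
{
	int n = 1024;			/* assumed number of CPUs */
	double sum = 0.0;
	int i;

	for (i = 0; (1 << i) <= n; i++)
		sum += (1.0 / (1 << i)) * ((double)n / (1 << i)) * (1 << i);

	printf("n = %d, per-tick balance work ~ %.1f (< 2n = %d)\n",
	       n, sum, 2 * n);
	return 0;
}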
* @@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; @@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->avg.load_avg) + return true; + + if (cfs_rq->avg.util_avg) + return true; + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu) struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; + bool done = true; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu) */ if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); + + /* Don't need periodic decay once load/util_avg are null */ + if (cfs_rq_has_blocked(cfs_rq)) + done = false; } + +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; + if (done) + rq->has_blocked_load = 0; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; + if (!cfs_rq_has_blocked(cfs_rq)) + rq->has_blocked_load = 0; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. * - * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a - * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a + * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. * Something like: * * { 0 1 2 3 } { 4 5 6 7 } @@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload - * cpu 3 and leave one of the cpus in the second group unused. + * cpu 3 and leave one of the CPUs in the second group unused. * * The current solution to this issue is detecting the skew in the first group * by noticing the lower domain failed to reach balance and had difficulty @@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group, return group_other; } +static bool update_nohz_stats(struct rq *rq, bool force) +{ +#ifdef CONFIG_NO_HZ_COMMON + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) + return true; + + update_blocked_averages(cpu); + + return rq->has_blocked_load; +#else + return false; +#endif +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - /* Bias balancing toward cpus of our domain */ + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) + env->flags |= LBF_NOHZ_AGAIN; + + /* Bias balancing toward CPUs of our domain: */ if (local_group) load = target_load(i, load_idx); else @@ -7902,7 +8231,7 @@ asym_packing: if (!(env->sd->flags & SD_ASYM_PACKING)) return true; - /* No ASYM_PACKING if target cpu is already busy */ + /* No ASYM_PACKING if target CPU is already busy */ if (env->idle == CPU_NOT_IDLE) return true; /* @@ -7915,7 +8244,7 @@ asym_packing: if (!sds->busiest) return true; - /* Prefer to move from lowest priority cpu's work */ + /* Prefer to move from lowest priority CPU's work */ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu)) return true; @@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; +#ifdef CONFIG_NO_HZ_COMMON + if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) + env->flags |= LBF_NOHZ_STATS; +#endif + load_idx = get_sd_load_idx(env->sd, env->idle); do { @@ -8024,6 +8358,15 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); +#ifdef CONFIG_NO_HZ_COMMON + if ((env->flags & LBF_NOHZ_AGAIN) && + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { + + WRITE_ONCE(nohz.next_blocked, + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); + } +#endif + if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); @@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages - * to ensure cpu-load equilibrium, look at wider averages. XXX + * to ensure CPU-load equilibrium, look at wider averages. XXX */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); @@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * If there aren't any idle cpus, avoid creating some. + * If there aren't any idle CPUs, avoid creating some. */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { @@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * We're trying to get all the cpus to the average_load, so we don't + * We're trying to get all the CPUs to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load. At the same time, + * reduce the max loaded CPU below the average load. At the same time, * we also don't want to reduce the group load below the group * capacity. Thus we look for the minimum possible imbalance. */ @@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group is not overloaded + * This CPU is idle. If the busiest group is not overloaded * and there is no imbalance between this and busiest group - * wrt idle cpus, it is balanced. The imbalance becomes + * wrt idle CPUs, it is balanced. 
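A simplified sketch of the "minimum possible imbalance" reasoning in the calculate_imbalance() comments above; capacity scaling and the group_imbalanced/overloaded special cases are deliberately ignored here:

#include <stdio.h>

/*
 * Simplified illustration: move only as much load as brings the busiest
 * group down to the average without pushing the local group above it.
 * Not the kernel's calculate_imbalance().
 */
static unsigned long imbalance_sketch(unsigned long busiest_load,
				      unsigned long local_load,
				      unsigned long avg_load)
{
	unsigned long pull_from_busiest, room_in_local;

	if (busiest_load <= avg_load || local_load >= avg_load)
		return 0;		/* nothing useful to move */

	pull_from_busiest = busiest_load - avg_load;
	room_in_local = avg_load - local_load;

	return pull_from_busiest < room_in_local ? pull_from_busiest
						 : room_in_local;
}

int main(void)
{
	/* busiest 900, local 300, average 600: moving 300 equalizes both */
	printf("imbalance = %lu\n", imbalance_sketch(900, 300, 600));
	return 0;
}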
The imbalance becomes * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */ @@ -8327,7 +8670,7 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the cpus in group. + * find_busiest_queue - find the busiest runqueue among the CPUs in the group. */ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) @@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * which is not scaled with the CPU capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; /* - * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so - * that the load can be moved away from the cpu that is + * For the load comparisons with the other CPU's, consider + * the weighted_cpuload() scaled with the CPU capacity, so + * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * * Thus we're looking for max(wl_i / capacity_i), crosswise @@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env) return 0; /* - * In the newly idle case, we will allow all the cpu's + * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. */ if (env->idle == CPU_NEWLY_IDLE) return 1; - /* Try to find first idle cpu */ + /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; @@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env) balance_cpu = group_balance_cpu(sg); /* - * First idle cpu or the first cpu(busiest) in this sched group + * First idle CPU or the first CPU(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ return balance_cpu == env->dst_cpu; @@ -8580,7 +8923,7 @@ more_balance: * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we - * iterate on same src_cpu is dependent on number of cpus in our + * iterate on same src_cpu is dependent on number of CPUs in our * sched_group. * * This changes load balance semantics a bit on who can move @@ -8597,7 +8940,7 @@ more_balance: */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs */ cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); @@ -8659,9 +9002,10 @@ more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the active_load_balance_cpu_stop, - * if the curr task on busiest cpu can't be - * moved to this_cpu + /* + * Don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest CPU can't be + * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, @@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) } /* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. 
- */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) -{ - unsigned long next_balance = jiffies + HZ; - int this_cpu = this_rq->cpu; - struct sched_domain *sd; - int pulled_task = 0; - u64 curr_cost = 0; - - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq); - - /* - * Do not pull tasks towards !active CPUs... - */ - if (!cpu_active(this_cpu)) - return 0; - - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - rq_unpin_lock(this_rq, rf); - - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (sd) - update_next_balance(sd, &next_balance); - rcu_read_unlock(); - - goto out; - } - - raw_spin_unlock(&this_rq->lock); - - update_blocked_averages(this_cpu); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); - break; - } - - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - - pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, - &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; - } - - update_next_balance(sd, &next_balance); - - /* - * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. - */ - if (pulled_task || this_rq->nr_running > 0) - break; - } - rcu_read_unlock(); - - raw_spin_lock(&this_rq->lock); - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - - /* - * While browsing the domains, we released the rq lock, a task could - * have been enqueued in the meantime. Since we're not going idle, - * pretend we pulled a task. - */ - if (this_rq->cfs.h_nr_running && !pulled_task) - pulled_task = 1; - -out: - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - - if (pulled_task) - this_rq->idle_stamp = 0; - - rq_repin_lock(this_rq, rf); - - return pulled_task; -} - -/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances. 
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data) if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) goto out_unlock; - /* make sure the requested cpu hasn't gone down in the meantime */ + /* Make sure the requested CPU hasn't gone down in the meantime: */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) goto out_unlock; @@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data) /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. + * Bjorn Helgaas on a 128-CPU setup. */ BUG_ON(busiest_rq == target_rq); @@ -8977,141 +9207,6 @@ out_unlock: return 0; } -static inline int on_null_domain(struct rq *rq) -{ - return unlikely(!rcu_dereference_sched(rq->sd)); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing - * needed, they will kick the idle load balancer, which then does idle - * load balancing for all the idle CPUs. - */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; - -static inline int find_new_ilb(void) -{ - int ilb = cpumask_first(nohz.idle_cpus_mask); - - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; - - return nr_cpu_ids; -} - -/* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). - */ -static void nohz_balancer_kick(void) -{ - int ilb_cpu; - - nohz.next_balance++; - - ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) - return; - - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) - return; - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. - */ - smp_send_reschedule(ilb_cpu); - return; -} - -void nohz_balance_exit_idle(unsigned int cpu) -{ - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - /* - * Completely isolated CPUs don't ever set, so we must test. - */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } -} - -static inline void set_cpu_sd_state_busy(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || !sd->nohz_idle) - goto unlock; - sd->nohz_idle = 0; - - atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - -void set_cpu_sd_state_idle(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || sd->nohz_idle) - goto unlock; - sd->nohz_idle = 1; - - atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - -/* - * This routine will record that the cpu is going idle with tick stopped. - * This info will be used in performing idle load balancing in the future. - */ -void nohz_balance_enter_idle(int cpu) -{ - /* - * If this cpu is going down, then nothing needs to be done. 
- */ - if (!cpu_active(cpu)) - return; - - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) - return; - - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; - - /* - * If we're a completely isolated CPU, we don't play. - */ - if (on_null_domain(cpu_rq(cpu))) - return; - - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); -} -#endif - static DEFINE_SPINLOCK(balancing); /* @@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; - update_blocked_averages(cpu); - rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -9232,68 +9325,56 @@ out: } } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * idle load balancing details + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. */ -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) -{ - int this_cpu = this_rq->cpu; - struct rq *rq; - int balance_cpu; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - if (idle != CPU_IDLE || - !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) - goto end; +static inline int find_new_ilb(void) +{ + int ilb = cpumask_first(nohz.idle_cpus_mask); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) - continue; + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; + return nr_cpu_ids; +} - rq = cpu_rq(balance_cpu); +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void kick_ilb(unsigned int flags) +{ + int ilb_cpu; - /* - * If time for next balance is due, - * do the balance. - */ - if (time_after_eq(jiffies, rq->next_balance)) { - struct rq_flags rf; + nohz.next_balance++; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - cpu_load_update_idle(rq); - rq_unlock_irq(rq, &rf); + ilb_cpu = find_new_ilb(); - rebalance_domains(rq, CPU_IDLE); - } + if (ilb_cpu >= nr_cpu_ids) + return; - if (time_after(next_balance, rq->next_balance)) { - next_balance = rq->next_balance; - update_next_balance = 1; - } - } + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); + if (flags & NOHZ_KICK_MASK) + return; /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target CPU which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. 
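The kick itself relies on an atomic fetch-or so that only the first kicker sends the IPI; a userspace sketch of that pattern (the NOHZ_* values here are illustrative, not the kernel's definitions):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the kick pattern used by kick_ilb() above: the kick bits are
 * OR-ed in atomically, and only the caller that transitions the flags from
 * "no kick pending" to "kick pending" sends the (here simulated) IPI.
 */
#define NOHZ_BALANCE_KICK	0x1
#define NOHZ_STATS_KICK		0x2
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

static atomic_uint ilb_flags;

static bool kick_ilb_sketch(unsigned int flags)
{
	unsigned int old = atomic_fetch_or(&ilb_flags, flags);

	if (old & NOHZ_KICK_MASK)
		return false;		/* someone already kicked the ILB */

	printf("sending reschedule IPI (flags 0x%x)\n", flags);
	return true;
}

int main(void)
{
	kick_ilb_sketch(NOHZ_STATS_KICK);	/* first caller sends the IPI */
	kick_ilb_sketch(NOHZ_BALANCE_KICK);	/* already pending: no IPI   */
	return 0;
}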
*/ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; -end: - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); + smp_send_reschedule(ilb_cpu); } /* @@ -9307,36 +9388,41 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static void nohz_balancer_kick(struct rq *rq) { unsigned long now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; - bool kick = false; + unsigned int flags = 0; if (unlikely(rq->idle_balance)) - return false; + return; - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. - */ - set_cpu_sd_state_busy(); - nohz_balance_exit_idle(cpu); + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + nohz_balance_exit_idle(rq); /* * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return false; + return; + + if (READ_ONCE(nohz.has_blocked) && + time_after(now, READ_ONCE(nohz.next_blocked))) + flags = NOHZ_STATS_KICK; if (time_before(now, nohz.next_balance)) - return false; + goto out; - if (rq->nr_running >= 2) - return true; + if (rq->nr_running >= 2) { + flags = NOHZ_KICK_MASK; + goto out; + } rcu_read_lock(); sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); @@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq) */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } @@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq) if (sd) { if ((rq->cfs.h_nr_running >= 1) && check_cpu_capacity(rq, sd)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } @@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq) continue; if (sched_asym_prefer(i, cpu)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } } unlock: rcu_read_unlock(); - return kick; +out: + if (flags) + kick_ilb(flags); +} + +static void set_cpu_sd_state_busy(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +void nohz_balance_exit_idle(struct rq *rq) +{ + SCHED_WARN_ON(rq != this_rq()); + + if (likely(!rq->nohz_tick_stopped)) + return; + + rq->nohz_tick_stopped = 0; + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + + set_cpu_sd_state_busy(rq->cpu); +} + +static void set_cpu_sd_state_idle(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + atomic_dec(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +/* + * This routine will record that the CPU is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. 
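Condensing the nohz_balancer_kick() decision flow above into a sketch (helper arguments and flag values are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/*
 * Condensed illustration of the decision made above: a busy CPU asks the
 * idle load balancer either for a full balance or only for a refresh of
 * the idle CPUs' blocked-load statistics.
 */
#define NOHZ_BALANCE_KICK	0x1
#define NOHZ_STATS_KICK		0x2
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

static unsigned int nohz_kick_flags(bool any_nohz_cpu, bool blocked_load_stale,
				    bool balance_due, bool overloaded)
{
	unsigned int flags = 0;

	if (!any_nohz_cpu)
		return 0;		/* nobody is tickless: nothing to do */

	if (blocked_load_stale)
		flags = NOHZ_STATS_KICK;

	if (balance_due && overloaded)
		flags = NOHZ_KICK_MASK;	/* full idle load balance */

	return flags;
}

int main(void)
{
	printf("0x%x\n", nohz_kick_flags(true, true, false, false)); /* stats only */
	printf("0x%x\n", nohz_kick_flags(true, false, true, true));  /* full kick  */
	return 0;
}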
+ */ +void nohz_balance_enter_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + SCHED_WARN_ON(cpu != smp_processor_id()); + + /* If this CPU is going down, then nothing needs to be done: */ + if (!cpu_active(cpu)) + return; + + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + return; + + /* + * Can be set safely without rq->lock held + * If a clear happens, it will have evaluated last additions because + * rq->lock is held during the check and the clear + */ + rq->has_blocked_load = 1; + + /* + * The tick is still stopped but load could have been added in the + * meantime. We set the nohz.has_blocked flag to trig a check of the + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear + * of nohz.has_blocked can only happen after checking the new load + */ + if (rq->nohz_tick_stopped) + goto out; + + /* If we're a completely isolated CPU, we don't play: */ + if (on_null_domain(rq)) + return; + + rq->nohz_tick_stopped = 1; + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + + /* + * Ensures that if nohz_idle_balance() fails to observe our + * @idle_cpus_mask store, it must observe the @has_blocked + * store. + */ + smp_mb__after_atomic(); + + set_cpu_sd_state_idle(cpu); + +out: + /* + * Each time a cpu enter idle, we assume that it has blocked load and + * enable the periodic update of the load of idle cpus + */ + WRITE_ONCE(nohz.has_blocked, 1); +} + +/* + * Internal function that runs load balance for all idle cpus. The load balance + * can be a simple update of blocked load or a complete load balance with + * tasks movement depending of flags. + * The function returns false if the loop has stopped before running + * through all idle CPUs. + */ +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, + enum cpu_idle_type idle) +{ + /* Earliest time when we have to do rebalance again */ + unsigned long now = jiffies; + unsigned long next_balance = now + 60*HZ; + bool has_blocked_load = false; + int update_next_balance = 0; + int this_cpu = this_rq->cpu; + int balance_cpu; + int ret = false; + struct rq *rq; + + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + + /* + * We assume there will be no idle load after this update and clear + * the has_blocked flag. If a cpu enters idle in the mean time, it will + * set the has_blocked flag and trig another update of idle load. + * Because a cpu that becomes idle, is added to idle_cpus_mask before + * setting the flag, we are sure to not clear the state and not + * check the load of an idle cpu. + */ + WRITE_ONCE(nohz.has_blocked, 0); + + /* + * Ensures that if we miss the CPU, we must see the has_blocked + * store from nohz_balance_enter_idle(). + */ + smp_mb(); + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + continue; + + /* + * If this CPU gets work to do, stop the load balancing + * work being done for other CPUs. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + has_blocked_load = true; + goto abort; + } + + rq = cpu_rq(balance_cpu); + + has_blocked_load |= update_nohz_stats(rq, true); + + /* + * If time for next balance is due, + * do the balance. 
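The ordering argument above can be modelled in userspace; the sketch below (seq_cst atomics standing in for the kernel's barriers, build with -pthread) checks that a CPU missed by the scan always leaves has_blocked set:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/*
 * Model of the protocol described above: a CPU going idle publishes itself
 * in the idle mask *before* setting nohz.has_blocked, while the balancer
 * clears has_blocked *before* scanning the mask. Either the balancer sees
 * the new idle CPU, or has_blocked is left set, so blocked load is never
 * silently forgotten. Names and structure are illustrative only.
 */
static atomic_int cpu_in_mask;		/* stands in for nohz.idle_cpus_mask */
static atomic_int has_blocked;		/* stands in for nohz.has_blocked    */

static void *enter_idle(void *arg)
{
	atomic_store(&cpu_in_mask, 1);		/* add CPU to the idle mask    */
	atomic_store(&has_blocked, 1);		/* then advertise blocked load */
	return NULL;
}

static void *idle_balance(void *arg)
{
	int *seen = arg;

	atomic_store(&has_blocked, 0);		/* assume we'll update everyone */
	*seen = atomic_load(&cpu_in_mask);	/* then scan the idle mask      */
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 10000; i++) {
		pthread_t a, b;
		int seen = 0;

		atomic_store(&cpu_in_mask, 0);
		atomic_store(&has_blocked, 0);
		pthread_create(&a, NULL, enter_idle, NULL);
		pthread_create(&b, NULL, idle_balance, &seen);
		pthread_join(a, NULL);
		pthread_join(b, NULL);

		/* A missed CPU implies the flag survived for a later pass. */
		if (!seen)
			assert(atomic_load(&has_blocked) == 1);
	}
	printf("invariant held for all runs\n");
	return 0;
}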
+ */ + if (time_after_eq(jiffies, rq->next_balance)) { + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + cpu_load_update_idle(rq); + rq_unlock_irqrestore(rq, &rf); + + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(rq, CPU_IDLE); + } + + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } + } + + /* Newly idle CPU doesn't need an update */ + if (idle != CPU_NEWLY_IDLE) { + update_blocked_averages(this_cpu); + has_blocked_load |= this_rq->has_blocked_load; + } + + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(this_rq, CPU_IDLE); + + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + + /* The full idle balance loop has been done */ + ret = true; + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + + return ret; +} + +/* + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + int this_cpu = this_rq->cpu; + unsigned int flags; + + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + return false; + + if (idle != CPU_IDLE) { + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + return false; + } + + /* + * barrier, pairs with nohz_balance_enter_idle(), ensures ... + */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + if (!(flags & NOHZ_KICK_MASK)) + return false; + + _nohz_idle_balance(this_rq, flags, idle); + + return true; +} + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + + /* + * This CPU doesn't want to be disturbed by scheduler + * housekeeping + */ + if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + return; + + /* Will wake up very soon. No time for doing anything else*/ + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* Don't need to update blocked load of idle CPUs*/ + if (!READ_ONCE(nohz.has_blocked) || + time_before(jiffies, READ_ONCE(nohz.next_blocked))) + return; + + raw_spin_unlock(&this_rq->lock); + /* + * This CPU is going to be idle and blocked load of idle CPUs + * need to be updated. Run the ilb locally as it is a good + * candidate for ilb instead of waking up another idle CPU. + * Kick an normal ilb if we failed to do the update. + */ + if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) + kick_ilb(NOHZ_STATS_KICK); + raw_spin_lock(&this_rq->lock); +} + +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void nohz_balancer_kick(struct rq *rq) { } + +static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + return false; +} + +static inline void nohz_newidle_balance(struct rq *this_rq) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. 
+ */ +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; + struct sched_domain *sd; + int pulled_task = 0; + u64 curr_cost = 0; + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, &next_balance); + rcu_read_unlock(); + + nohz_newidle_balance(this_rq); + + goto out; + } + + raw_spin_unlock(&this_rq->lock); + + update_blocked_averages(this_cpu); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + int continue_balancing = 1; + u64 t0, domain_cost; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, &next_balance); + break; + } + + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, + &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; + } + + update_next_balance(sd, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + break; + } + rcu_read_unlock(); + + raw_spin_lock(&this_rq->lock); + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) + this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + return pulled_task; } -#else -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } -#endif /* * run_rebalance_domains is triggered when needed from the scheduler tick. @@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) CPU_IDLE : CPU_NOT_IDLE; /* - * If this cpu has a pending nohz_balance_kick, then do the - * balancing on behalf of the other idle cpus whose ticks are + * If this CPU has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to - * give the idle cpus a chance to load balance. Else we may + * give the idle CPUs a chance to load balance. 
Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. */ - nohz_idle_balance(this_rq, idle); + if (nohz_idle_balance(this_rq, idle)) + return; + + /* normal load balance */ + update_blocked_averages(this_rq->cpu); rebalance_domains(this_rq, idle); } @@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); -#endif + + nohz_balancer_kick(rq); } static void rq_online_fair(struct rq *rq) @@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ /* - * scheduler tick hitting a task of our scheduling class: + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { @@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } @@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; + nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif #endif /* SMP */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5..85ae848 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. + */ +SCHED_FEAT(UTIL_EST, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb..2975f19 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,23 +1,14 @@ /* - * Generic entry point for the idle threads + * Generic entry points for the idle threads and + * implementation of the idle task scheduling class. 
+ * + * (NOTE: these are not related to SCHED_IDLE batch scheduled + * tasks which are handled in sched/fair.c ) */ -#include <linux/sched.h> -#include <linux/sched/idle.h> -#include <linux/cpu.h> -#include <linux/cpuidle.h> -#include <linux/cpuhotplug.h> -#include <linux/tick.h> -#include <linux/mm.h> -#include <linux/stackprotector.h> -#include <linux/suspend.h> -#include <linux/livepatch.h> - -#include <asm/tlb.h> +#include "sched.h" #include <trace/events/power.h> -#include "sched.h" - /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) static int __init cpu_idle_poll_setup(char *__unused) { cpu_idle_force_poll = 1; + return 1; } __setup("nohlt", cpu_idle_poll_setup); @@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); static int __init cpu_idle_nopoll_setup(char *__unused) { cpu_idle_force_poll = 0; + return 1; } __setup("hlt", cpu_idle_nopoll_setup); @@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); stop_critical_timings(); + while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); + return 1; } @@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) { /* * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). Will be fixed in 3.11 + * make this generic (ARM and SH have never invoked the canary + * init for the non boot CPUs!). Will be fixed in 3.11 */ #ifdef CONFIG_X86 /* @@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) while (1) do_idle(); } + +/* + * idle-task scheduling class. + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif + +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ + resched_curr(rq); +} + +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + put_prev_task(rq, prev); + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); + + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ + raw_spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
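For readers unfamiliar with the pattern, a toy function-pointer table in the spirit of the idle class in this hunk (this is not the kernel's struct sched_class; all names are invented):

#include <stdio.h>

/*
 * Toy illustration: a scheduling class is just a table of callbacks the
 * core scheduler invokes, and the idle class fills most slots with trivial
 * implementations.
 */
struct mini_rq { int cpu; };

struct mini_sched_class {
	void (*task_tick)(struct mini_rq *rq);
	struct mini_rq *(*pick_next)(struct mini_rq *rq);
};

static void idle_task_tick(struct mini_rq *rq)
{
	/* nothing to do for the idle task */
}

static struct mini_rq *idle_pick_next(struct mini_rq *rq)
{
	printf("CPU %d goes idle\n", rq->cpu);
	return rq;
}

static const struct mini_sched_class mini_idle_class = {
	.task_tick = idle_task_tick,
	.pick_next = idle_pick_next,
};

int main(void)
{
	struct mini_rq rq = { .cpu = 0 };

	mini_idle_class.pick_next(&rq);
	mini_idle_class.task_tick(&rq);
	return 0;
}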
+ */ +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ + BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); +} + +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +static void update_curr_idle(struct rq *rq) +{ +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, +#endif + + .set_curr_task = set_curr_task_idle, + .task_tick = task_tick_idle, + + .get_rr_interval = get_rr_interval_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, +}; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664..0000000 --- a/kernel/sched/idle_task.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - -/* - * idle-task scheduling class. - * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched/fair.c) - */ - -#ifdef CONFIG_SMP -static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) -{ - return task_cpu(p); /* IDLE tasks as never migrated */ -} -#endif /* CONFIG_SMP */ - -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) -{ - resched_curr(rq); -} - -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) -{ - raw_spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - raw_spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ - rq_last_tick_reset(rq); -} - -static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_idle(struct rq *rq) -{ -} - -static void switched_to_idle(struct rq *rq, struct task_struct *p) -{ - BUG(); -} - -static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) -{ - BUG(); -} - -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - -static void update_curr_idle(struct rq *rq) -{ -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - 
.put_prev_task = put_prev_task_idle, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, - .set_cpus_allowed = set_cpus_allowed_common, -#endif - - .set_curr_task = set_curr_task_idle, - .task_tick = task_tick_idle, - - .get_rr_interval = get_rr_interval_idle, - - .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, -}; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436..e680218 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -3,15 +3,10 @@ * any CPU: unbound workqueues, timers, kthreads and any offloadable work. * * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ - -#include <linux/sched/isolation.h> -#include <linux/tick.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/static_key.h> -#include <linux/ctype.h> +#include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); @@ -60,6 +55,9 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overriden); + if (housekeeping_flags & HK_FLAG_TICK) + sched_tick_offload_init(); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } @@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e..a171c12 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,10 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ - -#include <linux/export.h> -#include <linux/sched/loadavg.h> - #include "sched.h" /* @@ -32,29 +28,29 @@ * Due to a number of reasons the above turns in the mess below: * * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach + * serious number of CPUs, therefore we need to take a distributed approach * to calculating nr_active. * * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } * * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate + * can simply take per-CPU deltas and fold those into a global accumulate * to obtain the same result. See calc_load_fold_active(). * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * Furthermore, in order to avoid synchronizing all per-CPU delta folding * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. + * CPU to have completed this task. * * This places an upper-bound on the IRQ-off latency of the machine. Then * again, being late doesn't loose the delta, just wrecks the sample. * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. 
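A userspace sketch of the per-CPU delta folding described above (structure and numbers are invented; see calc_load_fold_active() for the real thing):

#include <stdio.h>

/*
 * Illustration of the distributed nr_active accounting: each CPU only
 * remembers the count it last reported and folds the delta into a global
 * sum, so no for_each_possible_cpu() walk is needed when sampling.
 */
#define NR_CPUS 4

struct cpu_state {
	long nr_running;	/* current runnable tasks on this CPU    */
	long last_reported;	/* what this CPU already folded globally */
};

static long calc_load_tasks;	/* global accumulator */

static void fold_active(struct cpu_state *c)
{
	long delta = c->nr_running - c->last_reported;

	if (delta) {
		calc_load_tasks += delta;
		c->last_reported = c->nr_running;
	}
}

int main(void)
{
	struct cpu_state cpus[NR_CPUS] = {
		{ .nr_running = 2 }, { .nr_running = 0 },
		{ .nr_running = 1 }, { .nr_running = 3 },
	};

	for (int i = 0; i < NR_CPUS; i++)
		fold_active(&cpus[i]);

	printf("nr_active = %ld\n", calc_load_tasks);	/* 6 */

	/* CPU 3 blocks two tasks; only the delta (-2) is folded. */
	cpus[3].nr_running = 1;
	fold_active(&cpus[3]);
	printf("nr_active = %ld\n", calc_load_tasks);	/* 4 */
	return 0;
}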
Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because + * this would add another cross-CPU cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever CPU the task ran + * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. + * all CPUs yields the correct result. * * This covers the NO_HZ=n code, for extra head-aches, see the comment below. */ @@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * Handle NO_HZ for the global load-average. * * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by + * load-average relies on per-CPU sampling from the tick, it is affected by * NO_HZ. * * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * entering NO_HZ state such that we can include this as an 'extra' CPU delta * when we read the global state. * * Obviously reality has to ruin such a delightfully simple scheme: @@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which + * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ * intervals. * * When making the ILB scale, we should try to pull this in as well. @@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, } /* - * NO_HZ can leave us missing all per-cpu ticks calling + * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. @@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ delta = calc_load_nohz_fold(); if (delta) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d07626..76e0eaf 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -13,32 +13,25 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include <linux/syscalls.h> -#include <linux/membarrier.h> -#include <linux/tick.h> -#include <linux/cpumask.h> -#include <linux/atomic.h> - -#include "sched.h" /* for cpu_rq(). */ +#include "sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. 
*/ #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE -#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ - (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) #else #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif -#define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ - | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) @@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) */ if (cpu == raw_smp_processor_id()) continue; + rcu_read_lock(); p = task_rcu_dereference(&cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & @@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; } @@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) } atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); + return 0; } @@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) synchronize_sched(); } atomic_or(state, &mm->membarrier_state); + return 0; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451..86b7798 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ - #include "sched.h" -#include <linux/slab.h> -#include <linux/irq_work.h> - int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { if (!has_pushable_tasks(rq)) return; @@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void rt_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } @@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) { } -static inline void queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_RT); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ + cpufreq_update_util(rq, 0); } static void @@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq, 0); } #if defined CONFIG_SMP @@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) return; /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: + * There appear to be other CPUs that can accept + * the current task but none can run 'p', so lets reschedule + * to try and push the current task away: */ requeue_task_rt(rq, p, 1); resched_curr(rq); @@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); - queue_push_tasks(rq); + rt_queue_push_tasks(rq); return p; } @@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) if (!task_running(rq, p) && cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; + return 0; } /* * Return the highest pushable rq's task, which is suitable to be executed - * on the cpu, NULL otherwise + * on the CPU, NULL otherwise */ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { @@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* - * At this point we have built a mask of cpus representing the + * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. * - * We prioritize the last cpu that the task executed on since + * We prioritize the last CPU that the task executed on since * it is most likely cache-hot in that location. */ if (cpumask_test_cpu(cpu, lowest_mask)) @@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task) /* * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. + * out which CPU is logically closest to our hot cache data. */ if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ @@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task) cpu = cpumask_any(lowest_mask); if (cpu < nr_cpu_ids) return cpu; + return -1; } @@ -1827,7 +1828,7 @@ retry: * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue * to push it to. Do not retry in this case, since - * other cpus will pull from us when ready. + * other CPUs will pull from us when ready. */ goto out; } @@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd) * rt_next_cpu() will simply return the first CPU found in * the rto_mask. * - * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * If rto_next_cpu() is called with rto_cpu is a valid CPU, it * will return the next CPU found in the rto_mask. * * If there are no more CPUs left in the rto_mask, then a check is made @@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq) raw_spin_lock(&rq->rd->rto_lock); /* - * The rto_cpu is updated under the lock, if it has a valid cpu + * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. 
* Otherwise it is finishing up and an ipi needs to be sent. @@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq) /* * There's a chance that p is higher in priority - * than what's currently running on its cpu. + * than what's currently running on its CPU. * This is just that p is wakeing up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the @@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - queue_pull_task(rq); + rt_queue_pull_task(rq); } void __init init_sched_rt_class(void) @@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) - queue_push_tasks(rq); + rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); @@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - queue_pull_task(rq); + rt_queue_pull_task(rq); /* * If there's a higher priority task waiting to run @@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) static inline void watchdog(struct rq *rq, struct task_struct *p) { } #endif +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { struct sched_rt_entity *rt_se = &p->rt; @@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write, msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); + return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc45..c3deaee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,39 +1,73 @@ /* SPDX-License-Identifier: GPL-2.0 */ - +/* + * Scheduler internal types and methods: + */ #include <linux/sched.h> + #include <linux/sched/autogroup.h> -#include <linux/sched/sysctl.h> -#include <linux/sched/topology.h> -#include <linux/sched/rt.h> -#include <linux/sched/deadline.h> #include <linux/sched/clock.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/signal.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/stat.h> -#include <linux/sched/nohz.h> +#include <linux/sched/cputime.h> +#include <linux/sched/deadline.h> #include <linux/sched/debug.h> #include <linux/sched/hotplug.h> +#include <linux/sched/idle.h> +#include <linux/sched/init.h> +#include <linux/sched/isolation.h> +#include <linux/sched/jobctl.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/prio.h> +#include <linux/sched/rt.h> +#include <linux/sched/signal.h> +#include <linux/sched/stat.h> +#include <linux/sched/sysctl.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/sched/cputime.h> -#include <linux/sched/init.h> +#include <linux/sched/topology.h> +#include <linux/sched/user.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/xacct.h> + 
+#include <uapi/linux/sched/types.h> -#include <linux/u64_stats_sync.h> -#include <linux/kernel_stat.h> #include <linux/binfmts.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/compat.h> +#include <linux/context_tracking.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/kprobes.h> +#include <linux/kthread.h> +#include <linux/membarrier.h> +#include <linux/migrate.h> +#include <linux/mmu_context.h> +#include <linux/nmi.h> +#include <linux/proc_fs.h> +#include <linux/prefetch.h> +#include <linux/profile.h> +#include <linux/rcupdate_wait.h> +#include <linux/security.h> +#include <linux/stackprotector.h> #include <linux/stop_machine.h> -#include <linux/irq_work.h> -#include <linux/tick.h> -#include <linux/slab.h> -#include <linux/cgroup.h> +#include <linux/suspend.h> +#include <linux/swait.h> +#include <linux/syscalls.h> +#include <linux/task_work.h> +#include <linux/tsacct_kern.h> + +#include <asm/tlb.h> #ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> +# include <asm/paravirt.h> #endif #include "cpupri.h" @@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are - * pretty high and the returns do not justify the increased costs. + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. * - * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to - * increase coverage and consistency always enable it on 64bit platforms. + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. */ #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) @@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * 10 -> just above 1us * 9 -> just above 0.5us */ -#define DL_SCALE (10) +#define DL_SCALE 10 /* - * These are the 'tuning knobs' of the scheduler: + * Single value that denotes runtime == period, ie unlimited time. */ - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +#define RUNTIME_INF ((u64)~0ULL) static inline int idle_policy(int policy) { @@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); * control. 
*/ struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; }; static inline int dl_bandwidth_enabled(void) @@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) } struct dl_bw { - raw_spinlock_t lock; - u64 bw, total_bw; + raw_spinlock_t lock; + u64 bw; + u64 total_bw; }; static inline void __dl_update(struct dl_bw *dl_b, s64 bw); @@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); -extern int sched_dl_global_validate(void); +extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); -extern int sched_dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed); -extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, - const struct cpumask *trial); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -300,32 +328,36 @@ extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchical_quota; - u64 runtime_expires; - - int idle, period_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + s64 hierarchical_quota; + u64 runtime_expires; + + int idle; + int period_active; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + u64 throttled_time; #endif }; -/* task group related information */ +/* Task group related information */ struct task_group { struct cgroup_subsys_state css; #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; #ifdef CONFIG_SMP /* @@ -333,29 +365,29 @@ struct task_group { * it in its own cacheline separated from the fields above which * will also be accessed at each tick. 
*/ - atomic_long_t load_avg ____cacheline_aligned; + atomic_long_t load_avg ____cacheline_aligned; #endif #endif #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; - struct rt_bandwidth rt_bandwidth; + struct rt_bandwidth rt_bandwidth; #endif - struct rcu_head rcu; - struct list_head list; + struct rcu_head rcu; + struct list_head list; - struct task_group *parent; - struct list_head siblings; - struct list_head children; + struct task_group *parent; + struct list_head siblings; + struct list_head children; #ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; + struct autogroup *autogroup; #endif - struct cfs_bandwidth cfs_bandwidth; + struct cfs_bandwidth cfs_bandwidth; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -369,8 +401,8 @@ struct task_group { * (The default weight is 1024 - so there's no practical * limitation from this.) */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) #endif typedef int (*tg_visitor)(struct task_group *, void *); @@ -443,35 +475,39 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { - struct load_weight load; - unsigned long runnable_weight; - unsigned int nr_running, h_nr_running; + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; - u64 exec_clock; - u64 min_vruntime; + u64 exec_clock; + u64 min_vruntime; #ifndef CONFIG_64BIT - u64 min_vruntime_copy; + u64 min_vruntime_copy; #endif - struct rb_root_cached tasks_timeline; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last, *skip; + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; #ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; + unsigned int nr_spread_over; #endif #ifdef CONFIG_SMP /* * CFS load tracking */ - struct sched_avg avg; + struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -482,9 +518,9 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - long propagate; - long prop_runnable_sum; + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; /* * h_load = weight * f(tg) @@ -492,36 +528,38 @@ struct cfs_rq { * Where f(tg) is the recursive weight fraction assigned to * this group. */ - unsigned long h_load; - u64 last_h_load_update; - struct sched_entity *h_load_next; + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. 
*/ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_clock, throttled_clock_task; - u64 throttled_clock_task_time; - int throttled, throttle_count; - struct list_head throttled_list; + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_task; + u64 throttled_clock_task_time; + int throttled; + int throttle_count; + struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) /* Real-Time classes' related field in a runqueue: */ struct rt_rq { - struct rt_prio_array active; - unsigned int rt_nr_running; - unsigned int rr_nr_running; + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { - int curr; /* highest queued rt task prio */ + int curr; /* highest queued rt task prio */ #ifdef CONFIG_SMP - int next; /* next highest */ + int next; /* next highest */ #endif } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ - int rt_queued; + int rt_queued; - int rt_throttled; - u64 rt_time; - u64 rt_runtime; + int rt_throttled; + u64 rt_time; + u64 rt_runtime; /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned long rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; + struct task_group *tg; #endif }; /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root_cached root; + struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned long dl_nr_running; #ifdef CONFIG_SMP /* @@ -586,28 +624,28 @@ struct dl_rq { * should migrate somewhere else. */ struct { - u64 curr; - u64 next; + u64 curr; + u64 next; } earliest_dl; - unsigned long dl_nr_migratory; - int overloaded; + unsigned long dl_nr_migratory; + int overloaded; /* * Tasks on this rq that can be pushed away. They are kept in * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root_cached pushable_dl_tasks_root; + struct rb_root_cached pushable_dl_tasks_root; #else - struct dl_bw dl_bw; + struct dl_bw dl_bw; #endif /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a * task blocks */ - u64 running_bw; + u64 running_bw; /* * Utilization of the tasks "assigned" to this runqueue (including @@ -618,14 +656,14 @@ struct dl_rq { * This is needed to compute the "inactive utilization" for the * runqueue (inactive utilization = this_bw - running_bw). */ - u64 this_bw; - u64 extra_bw; + u64 this_bw; + u64 extra_bw; /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. 
*/ - u64 bw_ratio; + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new + * fully partitioning the member CPUs from any other cpuset. Whenever a new * exclusive cpuset is created, we also create and attach a new root-domain * object. * */ struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + bool overload; /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). */ - cpumask_var_t dlo_mask; - atomic_t dlo_count; - struct dl_bw dl_bw; - struct cpudl cpudl; + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. */ - struct irq_work rto_push_work; - raw_spinlock_t rto_lock; + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; /* These are only updated and read within rto_lock */ - int rto_loop; - int rto_cpu; + int rto_loop; + int rto_cpu; /* These atomics are updated outside of a lock */ - atomic_t rto_loop_next; - atomic_t rto_loop_start; + atomic_t rto_loop_next; + atomic_t rto_loop_start; #endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_var_t rto_mask; - struct cpupri cpupri; + cpumask_var_t rto_mask; + struct cpupri cpupri; - unsigned long max_cpu_capacity; + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work); */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. 
*/ - unsigned int nr_running; + unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; + unsigned int nr_numa_running; + unsigned int nr_preferred_running; #endif #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; + unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; #endif /* CONFIG_SMP */ - unsigned long nohz_flags; + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_FULL - unsigned long last_sched_tick; -#endif - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; + /* capture load from *all* tasks on this CPU: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; - struct list_head *tmp_alone_branch; + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -751,94 +790,98 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned long nr_uninterruptible; - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - u64 clock_task; + unsigned int clock_update_flags; + u64 clock; + u64 clock_task; - atomic_t nr_iowait; + atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain *sd; - unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct callback_head *balance_callback; + + unsigned char idle_balance; - unsigned char idle_balance; /* For active balancing */ - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; /* This is used to determine avg_idle's max value */ - u64 max_idle_balance_cost; + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; + u64 prev_irq_time; #endif #ifdef CONFIG_PARAVIRT - u64 prev_steal_time; + u64 prev_steal_time; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; + u64 prev_steal_time_rq; #endif /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; + unsigned long calc_load_update; + long calc_load_active; #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; - 
call_single_data_t hrtick_csd; + int hrtick_csd_pending; + call_single_data_t hrtick_csd; #endif - struct hrtimer hrtick_timer; + struct hrtimer hrtick_timer; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_count; + unsigned int yld_count; /* schedule() stats */ - unsigned int sched_count; - unsigned int sched_goidle; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; #endif #ifdef CONFIG_SMP - struct llist_head wake_list; + struct llist_head wake_list; #endif #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; + struct cpuidle_state *idle_state; #endif }; @@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) * one position though, because the next rq_unpin_lock() will shift it * back. */ -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 -#define RQCF_UPDATED 0x04 +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 static inline void assert_clock_updated(struct rq *rq) { @@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void); /** * highest_flag_domain - Return highest sched_domain containing flag. - * @cpu: The cpu whose highest level of sched domain is to + * @cpu: The CPU whose highest level of sched domain is to * be returned. * @flag: The flag to check for the highest sched_domain - * for the given cpu. + * for the given CPU. * - * Returns the highest sched_domain of a cpu which contains the given flag. + * Returns the highest sched_domain of a CPU which contains the given flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { @@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { - atomic_t ref; + atomic_t ref; /* * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned long capacity; - unsigned long min_capacity; /* Min per-CPU capacity in group */ - unsigned long next_update; - int imbalance; /* XXX unrelated to capacity but shared group state */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ #ifdef CONFIG_SCHED_DEBUG - int id; + int id; #endif - unsigned long cpumask[0]; /* balance mask */ + unsigned long cpumask[0]; /* Balance mask */ }; struct sched_group { - struct sched_group *next; /* Must be a circular list */ - atomic_t ref; + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; - unsigned int group_weight; + unsigned int group_weight; struct sched_group_capacity *sgc; - int asym_prefer_cpu; /* cpu of highest priority in group */ + int asym_prefer_cpu; /* CPU of highest priority in group */ /* * The CPUs this group covers. 
@@ -1131,7 +1174,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) } /** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. */ static inline unsigned int group_first_cpu(struct sched_group *group) { @@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) return p->on_rq == TASK_ON_RQ_MIGRATING; } -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - /* * wake flags */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 -extern const int sched_prio_to_weight[40]; -extern const u32 sched_prio_to_wmult[40]; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: @@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40]; */ #define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -1422,10 +1458,10 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*yield_task) (struct rq *rq); - bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* * It is the responsibility of the pick_next_task() method that will @@ -1435,16 +1471,16 @@ struct sched_class { * May return RETRY_TASK when it finds a higher prio class has runnable * tasks. 
*/ - struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev, - struct rq_flags *rf); - void (*put_prev_task) (struct rq *rq, struct task_struct *p); + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_woken) (struct rq *this_rq, struct task_struct *task); + void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); @@ -1453,31 +1489,31 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_fork) (struct task_struct *p); - void (*task_dead) (struct task_struct *p); + void (*set_curr_task)(struct rq *rq); + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); /* * The switched_from() call is allowed to drop rq->lock, therefore we * cannot assume the switched_from/switched_to pair is serliazed by * rq->lock. They are however serialized by p->pi_lock. */ - void (*switched_from) (struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + int oldprio); - unsigned int (*get_rr_interval) (struct rq *rq, - struct task_struct *task); + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); - void (*update_curr) (struct rq *rq); + void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group) (struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p, int type); #endif }; @@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { SCHED_WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; } #else @@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -#define BW_SHIFT 20 -#define BW_UNIT (1 << BW_SHIFT) -#define RATIO_SHIFT 8 +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); /* * Tick may be needed by tasks in the runqueue depending on their policy and @@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } #else +static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } #endif @@ 
-1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -static inline void rq_last_tick_reset(struct rq *rq) -{ -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = jiffies; -#endif -} - extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, * regardless of entry order into the function. */ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) @@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ + /* printk() doesn't work well under rq->lock */ raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON -enum rq_nohz_flag_bits { - NOHZ_TICK_STOPPED, - NOHZ_BALANCE_KICK, -}; +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 + +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -extern void nohz_balance_exit_idle(unsigned int cpu); +extern void nohz_balance_exit_idle(struct rq *rq); #else -static inline void nohz_balance_exit_idle(unsigned int cpu) { } +static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif @@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity -#ifndef arch_scale_freq_invariant -#define arch_scale_freq_invariant() (true) -#endif -#else /* arch_scale_freq_capacity */ -#define arch_scale_freq_invariant() (false) +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - static inline unsigned long cpu_util_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) static inline unsigned long cpu_util_cfs(struct rq *rq) { - return rq->cfs.avg.util_avg; -} + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + return util; +} #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa..ab112cb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 - -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> - +/* + * /proc/schedstat implementation + */ #include "sched.h" /* - * bump this up when changing the output 
format or the meaning of an existing + * Current schedstat API version. + * + * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 15 @@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) * This itererator needs some explanation. * It returns 1 for the header position. * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * In a hotplugged system some CPUs, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. */ static void *schedstat_start(struct seq_file *file, loff_t *offset) { @@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; + return schedstat_start(file, offset); } @@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { static int __init proc_schedstat_init(void) { proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58d..8aea199 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) #define __schedstat_inc(var) do { var++; } while (0) -#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) #define __schedstat_add(var, amt) do { var += (amt); } while (0) -#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) -#define __schedstat_set(var, val) do { var = (val); } while (0) -#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -#define schedstat_val(var) (var) -#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) - -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{} -#define schedstat_enabled() 0 -#define __schedstat_inc(var) do { } while (0) -#define schedstat_inc(var) do { } while (0) -#define __schedstat_add(var, amt) do { } while (0) -#define schedstat_add(var, amt) do { } while (0) -#define __schedstat_set(var, val) do { } while (0) -#define schedstat_set(var, val) do { } while (0) -#define schedstat_val(var) 0 -#define schedstat_val_or_zero(var) 0 +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO @@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) /* * We are interested in knowing how long it was from the *first* time a - * task was queued to the time that it finally hit a cpu, we call this routine - * from dequeue_task() to account for possible rq->clock skew across cpus. The - * delta taken on each cpu would annul the skew. + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. */ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { @@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) } /* - * Called when a task finally hits the cpu. We can now calculate how + * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ @@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) + if (unlikely(sched_info_on())) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); + } } /* @@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(rq) - - t->sched_info.last_arrival; + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; rq_sched_info_depart(rq, delta); @@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* - * prev now departs the cpu. It's not interesting to record + * prev now departs the CPU. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. 
*/ @@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, if (next != rq->idle) sched_info_arrive(rq, next); } + static inline void -sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) __sched_info_switch(rq, prev, next); } -#else -#define sched_info_queued(rq, t) do { } while (0) -#define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(rq, t) do { } while (0) -#define sched_info_depart(rq, t) do { } while (0) -#define sched_info_arrive(rq, next) do { } while (0) -#define sched_info_switch(rq, t, next) do { } while (0) + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_queued(rq, t) do { } while (0) +# define sched_info_reset_dequeued(t) do { } while (0) +# define sched_info_dequeued(rq, t) do { } while (0) +# define sched_info_depart(rq, t) do { } while (0) +# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2..c183b79 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * stop-task scheduling class. * @@ -9,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "sched.h" #ifdef CONFIG_SMP static int @@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) cgroup_account_cputime(curr, delta_exec); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555..b6fb2c3 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/sched/signal.h> -#include <linux/swait.h> +/* + * <linux/swait.h> (simple wait queues ) implementation: + */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024..64cc564 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,10 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include <linux/sched.h> -#include <linux/mutex.h> -#include <linux/sched/isolation.h> - #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); @@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); return -1; } @@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_span(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); @@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) * are not. * * This leads to a few particularly weird cases where the sched_domain's are - * not of the same number for each cpu. Consider: + * not of the same number for each CPU. Consider: * * NUMA-2 0-3 0-3 * groups: {0-2},{1-3} {1-3},{0-2} @@ -780,7 +772,7 @@ fail: * ^ ^ ^ ^ * `-' `-' * - * The sched_domains are per-cpu and have a two way link (parent & child) and + * The sched_domains are per-CPU and have a two way link (parent & child) and * denote the ever growing mask of CPUs belonging to that level of topology. 
* * Each sched_domain has a circular (double) linked list of sched_group's, each @@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; + return sa_rootdomain; } @@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } #ifdef CONFIG_NUMA -static int sched_domains_numa_levels; enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; + +static int sched_domains_numa_levels; +static int sched_domains_curr_level; + +int sched_max_numa_distance; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; #endif /* @@ -1074,11 +1069,11 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * @@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif - /* Fixup, ensure @sd has at least @child cpus. */ + /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child)); @@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; } @@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, return 1; tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, new ? 
(new + idx_new) : &tmp, sizeof(struct sched_domain_attr)); @@ -1929,4 +1926,3 @@ match2: mutex_unlock(&sched_domains_mutex); } - diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7..928be52 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -3,14 +3,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/mm.h> -#include <linux/wait.h> -#include <linux/hash.h> -#include <linux/kthread.h> +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, break; } } + return nr_exclusive; } @@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock(&wq->lock); schedule(); spin_lock(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr); @@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock_irq(&wq->lock); schedule(); spin_lock_irq(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr_irq); @@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i if (ret) list_del_init(&wq_entry->entry); + return ret; } EXPORT_SYMBOL(autoremove_wake_function); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3ac..c67c6d2 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,10 +1,7 @@ /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include <linux/wait_bit.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/hash.h> +#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) @@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync wait_bit->key.bit_nr != key->bit_nr || test_bit(key->bit_nr, key->flags)) return 0; - else - return autoremove_wake_function(wq_entry, mode, sync, key); + + return autoremove_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(wake_bit_function); @@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( DEFINE_WAIT_BIT(wq_entry, word, bit); wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); @@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) __wake_up(wq_head, TASK_NORMAL, 1, &key); } @@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). 
- */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +wait_queue_head_t *__var_waitqueue(void *p) { - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); } +EXPORT_SYMBOL(__var_waitqueue); -static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, - void *arg) +static int +var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - atomic_t *val = key->flags; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) return 0; - return autoremove_wake_function(wq_entry, mode, sync, key); -} -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_atomic_t_action_f action, unsigned int mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - val = wbq_entry->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val, mode); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; + return autoremove_wake_function(wq_entry, mode, sync, key); } -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wq_entry = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .entry = \ - LIST_HEAD_INIT((name).wq_entry.entry), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, - wait_atomic_t_action_f action, - unsigned int mode) +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) { - struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wq_entry, p); - - return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), + }, + }; } -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +EXPORT_SYMBOL(init_wait_var_entry); -__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +void wake_up_var(void *var) { - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; + __wake_up_bit(__var_waitqueue(var), var, -1); } -EXPORT_SYMBOL(atomic_t_wait); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). 
- */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); +EXPORT_SYMBOL(wake_up_var); __sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait); @@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) io_schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait_io); @@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); @@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83..f044666 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(sigaltstack, - const compat_stack_t __user *, uss_ptr, - compat_stack_t __user *, uoss_ptr) +static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, + compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; @@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, return ret; } +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) +{ + return do_compat_sigaltstack(uss_ptr, uoss_ptr); +} + int compat_restore_altstack(const compat_stack_t __user *uss) { - int err = compat_sys_sigaltstack(uss, NULL); + int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } @@ -3629,11 +3635,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) /** * sys_sigpending - examine pending signals - * @set: where mask of pending signal is returned + * @uset: where mask of pending signal is returned */ -SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { - return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); + sigset_t set; + int err; + + if (sizeof(old_sigset_t) > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); + if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; } #ifdef CONFIG_COMPAT diff --git a/kernel/sys.c b/kernel/sys.c index f2289de..ad69218 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -69,6 +69,8 @@ #include <asm/io.h> #include <asm/unistd.h> +#include "uid16.h" + #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif @@ -340,7 +342,7 @@ out_unlock: * operations (as far as semantic preservation is concerned). 
*/ #ifdef CONFIG_MULTIUSER -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -392,12 +394,17 @@ error: return retval; } +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + return __sys_setregid(rgid, egid); +} + /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ -SYSCALL_DEFINE1(setgid, gid_t, gid) +long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -429,6 +436,11 @@ error: return retval; } +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + return __sys_setgid(gid); +} + /* * change the user struct in a credentials set to match the new UID */ @@ -473,7 +485,7 @@ static int set_user(struct cred *new) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -533,6 +545,11 @@ error: return retval; } +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + return __sys_setreuid(ruid, euid); +} + /* * setuid() is implemented like SysV with SAVED_IDS * @@ -544,7 +561,7 @@ error: * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -SYSCALL_DEFINE1(setuid, uid_t, uid) +long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -586,12 +603,17 @@ error: return retval; } +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + return __sys_setuid(uid); +} + /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -656,6 +678,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + return __sys_setresuid(ruid, euid, suid); +} + SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); @@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ /* * Same as above, but for rgid, egid, sgid. */ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -730,6 +757,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + return __sys_setresgid(rgid, egid, sgid); +} + SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) +long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; @@ -793,10 +825,15 @@ change_okay: return old_fsuid; } +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + return __sys_setfsuid(uid); +} + /* * Samma på svenska..
*/ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) +long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; @@ -830,6 +867,11 @@ change_okay: commit_creds(new); return old_fsgid; } + +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + return __sys_setfsgid(gid); +} #endif /* CONFIG_MULTIUSER */ /** @@ -1027,7 +1069,7 @@ out: return err; } -SYSCALL_DEFINE1(getpgid, pid_t, pid) +static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; @@ -1055,11 +1097,16 @@ out: return retval; } +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + return do_getpgid(pid); +} + #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { - return sys_getpgid(0); + return do_getpgid(0); } #endif @@ -1103,7 +1150,7 @@ static void set_special_pids(struct pid *pid) change_pid(curr, PIDTYPE_PGID, pid); } -SYSCALL_DEFINE0(setsid) +int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1136,6 +1183,11 @@ out: return err; } +SYSCALL_DEFINE0(setsid) +{ + return ksys_setsid(); +} + DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b518976..6cafc00 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -17,245 +17,406 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_quotactl); -cond_syscall(sys32_quotactl); -cond_syscall(sys_acct); -cond_syscall(sys_lookup_dcookie); -cond_syscall(compat_sys_lookup_dcookie); -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); -cond_syscall(sys_kexec_load); -cond_syscall(compat_sys_kexec_load); -cond_syscall(sys_kexec_file_load); -cond_syscall(sys_init_module); -cond_syscall(sys_finit_module); -cond_syscall(sys_delete_module); -cond_syscall(sys_socketpair); -cond_syscall(sys_bind); -cond_syscall(sys_listen); -cond_syscall(sys_accept); -cond_syscall(sys_accept4); -cond_syscall(sys_connect); -cond_syscall(sys_getsockname); -cond_syscall(sys_getpeername); -cond_syscall(sys_sendto); -cond_syscall(sys_send); -cond_syscall(sys_recvfrom); -cond_syscall(sys_recv); -cond_syscall(sys_socket); -cond_syscall(sys_setsockopt); -cond_syscall(compat_sys_setsockopt); -cond_syscall(sys_getsockopt); -cond_syscall(compat_sys_getsockopt); -cond_syscall(sys_shutdown); -cond_syscall(sys_sendmsg); -cond_syscall(sys_sendmmsg); -cond_syscall(compat_sys_sendmsg); -cond_syscall(compat_sys_sendmmsg); -cond_syscall(sys_recvmsg); -cond_syscall(sys_recvmmsg); -cond_syscall(compat_sys_recvmsg); -cond_syscall(compat_sys_recv); -cond_syscall(compat_sys_recvfrom); -cond_syscall(compat_sys_recvmmsg); -cond_syscall(sys_socketcall); -cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); -cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); -cond_syscall(sys_epoll_create); -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_wait); -cond_syscall(sys_epoll_pwait); -cond_syscall(compat_sys_epoll_pwait); -cond_syscall(sys_semget); -cond_syscall(sys_semop); -cond_syscall(sys_semtimedop); -cond_syscall(compat_sys_semtimedop); -cond_syscall(sys_semctl); -cond_syscall(compat_sys_semctl); -cond_syscall(sys_msgget); -cond_syscall(sys_msgsnd); -cond_syscall(compat_sys_msgsnd); -cond_syscall(sys_msgrcv); -cond_syscall(compat_sys_msgrcv); -cond_syscall(sys_msgctl); -cond_syscall(compat_sys_msgctl); -cond_syscall(sys_shmget); -cond_syscall(sys_shmat); -cond_syscall(compat_sys_shmat); -cond_syscall(sys_shmdt); -cond_syscall(sys_shmctl); 
-cond_syscall(compat_sys_shmctl); -cond_syscall(sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(sys_mq_notify); -cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_open); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(compat_sys_mq_notify); -cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_mbind); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(compat_sys_set_mempolicy); -cond_syscall(sys_add_key); -cond_syscall(sys_request_key); -cond_syscall(sys_keyctl); -cond_syscall(compat_sys_keyctl); -cond_syscall(compat_sys_socketcall); -cond_syscall(sys_inotify_init); -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); -cond_syscall(sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(sys_chown16); -cond_syscall(sys_fchown16); -cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); -cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); -cond_syscall(sys_sgetmask); -cond_syscall(sys_ssetmask); -cond_syscall(sys_vm86old); -cond_syscall(sys_vm86); -cond_syscall(sys_modify_ldt); -cond_syscall(sys_ipc); -cond_syscall(compat_sys_ipc); -cond_syscall(compat_sys_sysctl); -cond_syscall(sys_flock); -cond_syscall(sys_io_setup); -cond_syscall(sys_io_destroy); -cond_syscall(sys_io_submit); -cond_syscall(sys_io_cancel); -cond_syscall(sys_io_getevents); -cond_syscall(compat_sys_io_setup); -cond_syscall(compat_sys_io_submit); -cond_syscall(compat_sys_io_getevents); -cond_syscall(sys_sysfs); -cond_syscall(sys_syslog); -cond_syscall(sys_process_vm_readv); -cond_syscall(sys_process_vm_writev); -cond_syscall(compat_sys_process_vm_readv); -cond_syscall(compat_sys_process_vm_writev); -cond_syscall(sys_uselib); -cond_syscall(sys_fadvise64); -cond_syscall(sys_fadvise64_64); -cond_syscall(sys_madvise); -cond_syscall(sys_setuid); -cond_syscall(sys_setregid); -cond_syscall(sys_setgid); -cond_syscall(sys_setreuid); -cond_syscall(sys_setresuid); -cond_syscall(sys_getresuid); -cond_syscall(sys_setresgid); -cond_syscall(sys_getresgid); -cond_syscall(sys_setgroups); -cond_syscall(sys_getgroups); -cond_syscall(sys_setfsuid); -cond_syscall(sys_setfsgid); -cond_syscall(sys_capget); -cond_syscall(sys_capset); -cond_syscall(sys_copy_file_range); - -/* arch-specific weak syscall entries */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); -cond_syscall(compat_sys_s390_ipc); -cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); -cond_syscall(sys_s390_pci_mmio_read); -cond_syscall(sys_s390_pci_mmio_write); - -/* mmu depending weak syscall entries */ -cond_syscall(sys_mprotect); -cond_syscall(sys_msync); -cond_syscall(sys_mlock); -cond_syscall(sys_munlock); -cond_syscall(sys_mlockall); -cond_syscall(sys_munlockall); -cond_syscall(sys_mlock2); -cond_syscall(sys_mincore); -cond_syscall(sys_madvise); 
-cond_syscall(sys_mremap); -cond_syscall(sys_remap_file_pages); -cond_syscall(compat_sys_move_pages); -cond_syscall(compat_sys_migrate_pages); - -/* block-layer dependent */ -cond_syscall(sys_bdflush); -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); - -/* New file descriptors */ -cond_syscall(sys_signalfd); -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_signalfd4); -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_settime); -cond_syscall(compat_sys_timerfd_gettime); -cond_syscall(sys_eventfd); -cond_syscall(sys_eventfd2); -cond_syscall(sys_memfd_create); -cond_syscall(sys_userfaultfd); - -/* performance counters: */ -cond_syscall(sys_perf_event_open); - -/* fanotify! */ -cond_syscall(sys_fanotify_init); -cond_syscall(sys_fanotify_mark); -cond_syscall(compat_sys_fanotify_mark); +#define COND_SYSCALL(name) cond_syscall(sys_##name) +#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) + +/* + * This list is kept in the same order as include/uapi/asm-generic/unistd.h. + * Architecture specific entries go below, followed by deprecated or obsolete + * system calls. + */ + +COND_SYSCALL(io_setup); +COND_SYSCALL_COMPAT(io_setup); +COND_SYSCALL(io_destroy); +COND_SYSCALL(io_submit); +COND_SYSCALL_COMPAT(io_submit); +COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents); +COND_SYSCALL_COMPAT(io_getevents); + +/* fs/xattr.c */ + +/* fs/dcache.c */ + +/* fs/cookies.c */ +COND_SYSCALL(lookup_dcookie); +COND_SYSCALL_COMPAT(lookup_dcookie); + +/* fs/eventfd.c */ +COND_SYSCALL(eventfd2); + +/* fs/eventfd.c */ +COND_SYSCALL(epoll_create1); +COND_SYSCALL(epoll_ctl); +COND_SYSCALL(epoll_pwait); +COND_SYSCALL_COMPAT(epoll_pwait); + +/* fs/fcntl.c */ + +/* fs/inotify_user.c */ +COND_SYSCALL(inotify_init1); +COND_SYSCALL(inotify_add_watch); +COND_SYSCALL(inotify_rm_watch); + +/* fs/ioctl.c */ + +/* fs/ioprio.c */ +COND_SYSCALL(ioprio_set); +COND_SYSCALL(ioprio_get); + +/* fs/locks.c */ +COND_SYSCALL(flock); + +/* fs/namei.c */ + +/* fs/namespace.c */ + +/* fs/nfsctl.c */ + +/* fs/open.c */ + +/* fs/pipe.c */ + +/* fs/quota.c */ +COND_SYSCALL(quotactl); + +/* fs/readdir.c */ + +/* fs/read_write.c */ + +/* fs/sendfile.c */ + +/* fs/select.c */ + +/* fs/signalfd.c */ +COND_SYSCALL(signalfd4); +COND_SYSCALL_COMPAT(signalfd4); + +/* fs/splice.c */ + +/* fs/stat.c */ + +/* fs/sync.c */ + +/* fs/timerfd.c */ +COND_SYSCALL(timerfd_create); +COND_SYSCALL(timerfd_settime); +COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_gettime); +COND_SYSCALL_COMPAT(timerfd_gettime); + +/* fs/utimes.c */ + +/* kernel/acct.c */ +COND_SYSCALL(acct); + +/* kernel/capability.c */ +COND_SYSCALL(capget); +COND_SYSCALL(capset); + +/* kernel/exec_domain.c */ + +/* kernel/exit.c */ + +/* kernel/fork.c */ + +/* kernel/futex.c */ +COND_SYSCALL(futex); +COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(set_robust_list); +COND_SYSCALL_COMPAT(set_robust_list); +COND_SYSCALL(get_robust_list); +COND_SYSCALL_COMPAT(get_robust_list); + +/* kernel/hrtimer.c */ + +/* kernel/itimer.c */ + +/* kernel/kexec.c */ +COND_SYSCALL(kexec_load); +COND_SYSCALL_COMPAT(kexec_load); + +/* kernel/module.c */ +COND_SYSCALL(init_module); +COND_SYSCALL(delete_module); + +/* kernel/posix-timers.c */ + +/* kernel/printk.c */ +COND_SYSCALL(syslog); + +/* kernel/ptrace.c */ + +/* kernel/sched/core.c */ + +/* kernel/signal.c */ + +/* kernel/sys.c */ +COND_SYSCALL(setregid); +COND_SYSCALL(setgid); 
+COND_SYSCALL(setreuid); +COND_SYSCALL(setuid); +COND_SYSCALL(setresuid); +COND_SYSCALL(getresuid); +COND_SYSCALL(setresgid); +COND_SYSCALL(getresgid); +COND_SYSCALL(setfsuid); +COND_SYSCALL(setfsgid); +COND_SYSCALL(setgroups); +COND_SYSCALL(getgroups); + +/* kernel/time.c */ + +/* kernel/timer.c */ + +/* ipc/mqueue.c */ +COND_SYSCALL(mq_open); +COND_SYSCALL_COMPAT(mq_open); +COND_SYSCALL(mq_unlink); +COND_SYSCALL(mq_timedsend); +COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedreceive); +COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_notify); +COND_SYSCALL_COMPAT(mq_notify); +COND_SYSCALL(mq_getsetattr); +COND_SYSCALL_COMPAT(mq_getsetattr); + +/* ipc/msg.c */ +COND_SYSCALL(msgget); +COND_SYSCALL(msgctl); +COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL(msgrcv); +COND_SYSCALL_COMPAT(msgrcv); +COND_SYSCALL(msgsnd); +COND_SYSCALL_COMPAT(msgsnd); + +/* ipc/sem.c */ +COND_SYSCALL(semget); +COND_SYSCALL(semctl); +COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL(semtimedop); +COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semop); + +/* ipc/shm.c */ +COND_SYSCALL(shmget); +COND_SYSCALL(shmctl); +COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL(shmat); +COND_SYSCALL_COMPAT(shmat); +COND_SYSCALL(shmdt); + +/* net/socket.c */ +COND_SYSCALL(socket); +COND_SYSCALL(socketpair); +COND_SYSCALL(bind); +COND_SYSCALL(listen); +COND_SYSCALL(accept); +COND_SYSCALL(connect); +COND_SYSCALL(getsockname); +COND_SYSCALL(getpeername); +COND_SYSCALL(setsockopt); +COND_SYSCALL_COMPAT(setsockopt); +COND_SYSCALL(getsockopt); +COND_SYSCALL_COMPAT(getsockopt); +COND_SYSCALL(sendto); +COND_SYSCALL(shutdown); +COND_SYSCALL(recvfrom); +COND_SYSCALL_COMPAT(recvfrom); +COND_SYSCALL(sendmsg); +COND_SYSCALL_COMPAT(sendmsg); +COND_SYSCALL(recvmsg); +COND_SYSCALL_COMPAT(recvmsg); + +/* mm/filemap.c */ + +/* mm/nommu.c, also with MMU */ +COND_SYSCALL(mremap); + +/* security/keys/keyctl.c */ +COND_SYSCALL(add_key); +COND_SYSCALL(request_key); +COND_SYSCALL(keyctl); +COND_SYSCALL_COMPAT(keyctl); + +/* arch/example/kernel/sys_example.c */ + +/* mm/fadvise.c */ +COND_SYSCALL(fadvise64_64); + +/* mm/, CONFIG_MMU only */ +COND_SYSCALL(swapon); +COND_SYSCALL(swapoff); +COND_SYSCALL(mprotect); +COND_SYSCALL(msync); +COND_SYSCALL(mlock); +COND_SYSCALL(munlock); +COND_SYSCALL(mlockall); +COND_SYSCALL(munlockall); +COND_SYSCALL(mincore); +COND_SYSCALL(madvise); +COND_SYSCALL(remap_file_pages); +COND_SYSCALL(mbind); +COND_SYSCALL_COMPAT(mbind); +COND_SYSCALL(get_mempolicy); +COND_SYSCALL_COMPAT(get_mempolicy); +COND_SYSCALL(set_mempolicy); +COND_SYSCALL_COMPAT(set_mempolicy); +COND_SYSCALL(migrate_pages); +COND_SYSCALL_COMPAT(migrate_pages); +COND_SYSCALL(move_pages); +COND_SYSCALL_COMPAT(move_pages); + +COND_SYSCALL(perf_event_open); +COND_SYSCALL(accept4); +COND_SYSCALL(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg); + +/* + * Architecture specific syscalls: see further below + */ + +/* fanotify */ +COND_SYSCALL(fanotify_init); +COND_SYSCALL(fanotify_mark); /* open by handle */ -cond_syscall(sys_name_to_handle_at); -cond_syscall(sys_open_by_handle_at); -cond_syscall(compat_sys_open_by_handle_at); +COND_SYSCALL(name_to_handle_at); +COND_SYSCALL(open_by_handle_at); +COND_SYSCALL_COMPAT(open_by_handle_at); + +COND_SYSCALL(sendmmsg); +COND_SYSCALL_COMPAT(sendmmsg); +COND_SYSCALL(process_vm_readv); +COND_SYSCALL_COMPAT(process_vm_readv); +COND_SYSCALL(process_vm_writev); +COND_SYSCALL_COMPAT(process_vm_writev); /* compare kernel pointers */ -cond_syscall(sys_kcmp); +COND_SYSCALL(kcmp); + +COND_SYSCALL(finit_module); /* operate on Secure Computing 
state */ -cond_syscall(sys_seccomp); +COND_SYSCALL(seccomp); + +COND_SYSCALL(memfd_create); /* access BPF programs and maps */ -cond_syscall(sys_bpf); +COND_SYSCALL(bpf); /* execveat */ -cond_syscall(sys_execveat); +COND_SYSCALL(execveat); + +COND_SYSCALL(userfaultfd); /* membarrier */ -cond_syscall(sys_membarrier); +COND_SYSCALL(membarrier); + +COND_SYSCALL(mlock2); + +COND_SYSCALL(copy_file_range); /* memory protection keys */ -cond_syscall(sys_pkey_mprotect); -cond_syscall(sys_pkey_alloc); -cond_syscall(sys_pkey_free); +COND_SYSCALL(pkey_mprotect); +COND_SYSCALL(pkey_alloc); +COND_SYSCALL(pkey_free); + + +/* + * Architecture specific weak syscall entries. + */ + +/* pciconfig: alpha, arm, arm64, ia64, sparc */ +COND_SYSCALL(pciconfig_read); +COND_SYSCALL(pciconfig_write); +COND_SYSCALL(pciconfig_iobase); + +/* sys_socketcall: arm, mips, x86, ... */ +COND_SYSCALL(socketcall); +COND_SYSCALL_COMPAT(socketcall); + +/* compat syscalls for arm64, x86, ... */ +COND_SYSCALL_COMPAT(sysctl); +COND_SYSCALL_COMPAT(fanotify_mark); + +/* x86 */ +COND_SYSCALL(vm86old); +COND_SYSCALL(modify_ldt); +COND_SYSCALL_COMPAT(quotactl32); +COND_SYSCALL(vm86); +COND_SYSCALL(kexec_file_load); + +/* s390 */ +COND_SYSCALL(s390_pci_mmio_read); +COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL_COMPAT(s390_ipc); + +/* powerpc */ +cond_syscall(ppc_rtas); +COND_SYSCALL(spu_run); +COND_SYSCALL(spu_create); +COND_SYSCALL(subpage_prot); + + +/* + * Deprecated system calls which are still defined in + * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch + */ + +/* __ARCH_WANT_SYSCALL_NO_FLAGS */ +COND_SYSCALL(epoll_create); +COND_SYSCALL(inotify_init); +COND_SYSCALL(eventfd); +COND_SYSCALL(signalfd); +COND_SYSCALL_COMPAT(signalfd); + +/* __ARCH_WANT_SYSCALL_OFF_T */ +COND_SYSCALL(fadvise64); + +/* __ARCH_WANT_SYSCALL_DEPRECATED */ +COND_SYSCALL(epoll_wait); +COND_SYSCALL(recv); +COND_SYSCALL_COMPAT(recv); +COND_SYSCALL(send); +COND_SYSCALL(bdflush); +COND_SYSCALL(uselib); + + +/* + * The syscalls below are not found in include/uapi/asm-generic/unistd.h + */ + +/* obsolete: SGETMASK_SYSCALL */ +COND_SYSCALL(sgetmask); +COND_SYSCALL(ssetmask); + +/* obsolete: SYSFS_SYSCALL */ +COND_SYSCALL(sysfs); + +/* obsolete: __ARCH_WANT_SYS_IPC */ +COND_SYSCALL(ipc); +COND_SYSCALL_COMPAT(ipc); + +/* obsolete: UID16 */ +COND_SYSCALL(chown16); +COND_SYSCALL(fchown16); +COND_SYSCALL(getegid16); +COND_SYSCALL(geteuid16); +COND_SYSCALL(getgid16); +COND_SYSCALL(getgroups16); +COND_SYSCALL(getresgid16); +COND_SYSCALL(getresuid16); +COND_SYSCALL(getuid16); +COND_SYSCALL(lchown16); +COND_SYSCALL(setfsgid16); +COND_SYSCALL(setfsuid16); +COND_SYSCALL(setgid16); +COND_SYSCALL(setgroups16); +COND_SYSCALL(setregid16); +COND_SYSCALL(setresgid16); +COND_SYSCALL(setresuid16); +COND_SYSCALL(setreuid16); +COND_SYSCALL(setuid16); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19..78eabc4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -113,16 +113,6 @@ config NO_HZ_FULL endchoice -config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default (except CPU 0)" - depends on NO_HZ_FULL - help - If the user doesn't pass the nohz_full boot option to - define the range of full dynticks CPUs, consider that all - CPUs in the system are full dynticks by default. - Note the boot CPU will still be kept outside the range to - handle the timekeeping duty. 
- config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733..5d4a034 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) return 0; } -static int tick_nohz_init_all(void) -{ - int err = -1; - -#ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - err = 0; - cpumask_setall(tick_nohz_full_mask); - tick_nohz_full_running = true; -#endif - return err; -} - void __init tick_nohz_init(void) { int cpu, ret; - if (!tick_nohz_full_running) { - if (tick_nohz_init_all() < 0) - return; - } + if (!tick_nohz_full_running) + return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe @@ -481,11 +463,18 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); -int tick_nohz_tick_stopped(void) +bool tick_nohz_tick_stopped(void) { return __this_cpu_read(tick_cpu_sched.tick_stopped); } +bool tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @@ -741,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = KTIME_MAX; } -#ifdef CONFIG_NO_HZ_FULL - /* Limit the tick delta to the maximum scheduler deferment */ - if (!ts->inidle) - delta = min(delta, scheduler_tick_max_deferment()); -#endif - /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; @@ -953,13 +936,6 @@ void tick_nohz_idle_enter(void) struct tick_sched *ts; lockdep_assert_irqs_enabled(); - /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. 
- */ - set_cpu_sd_state_idle(); local_irq_disable(); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff..2c41650 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kprobes.h> #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; @@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_unlock(&event_mutex); } +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + return -ENOMEM; + ret = strncpy_from_user( + func, u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (ret < 0) + goto out; + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + ret = strncpy_from_user( + path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret < 0) + goto out; + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe( + path, p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. + */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ae4147e..1cd3fb4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) disable_kprobe(&tk->rp.kp); wait = 1; } + + /* + * if tk is not added to any list, it must be a local trace_kprobe + * created with perf_event_open. 
We don't need to wait for these + * trace_kprobes + */ + if (list_empty(&tk->list)) + wait = 0; out: if (wait) { /* @@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_kprobe_event(struct trace_kprobe *tk) +static inline void init_trace_event_call(struct trace_kprobe *tk, + struct trace_event_call *call) { - struct trace_event_call *call = &tk->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct trace_event_call *call = &tk->tp.call; + int ret = 0; + + init_trace_event_call(tk, call); + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = TRACE_EVENT_FL_KPROBE; - call->class->reg = kprobe_register; - call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", @@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_PERF_EVENTS +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + struct trace_kprobe *tk; + int ret; + char *event; + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? 
func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk, &tk->tp.call); + + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return &tk->tp.call; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = container_of(event_call, struct trace_kprobe, tp.call); + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4d3fa..75daff2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, } extern int set_print_fmt(struct trace_probe *tp, bool is_return); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); +#endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029a..2014f43 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static int register_uprobe_event(struct trace_uprobe *tu) +static inline void init_trace_event_call(struct trace_uprobe *tu, + struct trace_event_call *call) { - struct trace_event_call *call = &tu->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; + call->flags = TRACE_EVENT_FL_UPROBE; + call->class->reg = trace_uprobe_register; + call->data = tu; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct trace_event_call *call = &tu->tp.call; + int ret = 0; + + init_trace_event_call(tu, call); + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; @@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - call->flags = TRACE_EVENT_FL_UPROBE; - call->class->reg = trace_uprobe_register; - call->data = tu; ret = trace_add_event_call(call); if (ret) { @@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) return 0; } +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +{ + struct trace_uprobe *tu; + struct inode *inode; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + iput(inode); + return 
ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. + */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->inode = inode; + tu->filename = kstrdup(name, GFP_KERNEL); + init_trace_event_call(tu, &tu->tp.call); + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ret = -ENOMEM; + goto error; + } + + return &tu->tp.call; +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = container_of(event_call, struct trace_uprobe, tp.call); + + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { diff --git a/kernel/uid16.c b/kernel/uid16.c index ef1da2a..af6925d 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,44 +18,46 @@ #include <linux/uaccess.h> +#include "uid16.h" + SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_chown(filename, low2highuid(user), low2highgid(group)); + return ksys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_lchown(filename, low2highuid(user), low2highgid(group)); + return ksys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - return sys_fchown(fd, low2highuid(user), low2highgid(group)); + return ksys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - return sys_setregid(low2highgid(rgid), low2highgid(egid)); + return __sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - return sys_setgid(low2highgid(gid)); + return __sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - return sys_setreuid(low2highuid(ruid), low2highuid(euid)); + return __sys_setreuid(low2highuid(ruid), low2highuid(euid)); } SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - return sys_setuid(low2highuid(uid)); + return __sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - return sys_setresuid(low2highuid(ruid), low2highuid(euid), + return __sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); } @@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - return sys_setresgid(low2highgid(rgid), low2highgid(egid), + return __sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); } - SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - return sys_setfsuid(low2highuid(uid)); + return 
__sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - return sys_setfsgid(low2highgid(gid)); + return __sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, diff --git a/kernel/uid16.h b/kernel/uid16.h new file mode 100644 index 0000000..cdca040 --- /dev/null +++ b/kernel/uid16.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_UID16_H +#define LINUX_UID16_H + +long __sys_setuid(uid_t uid); +long __sys_setgid(gid_t gid); +long __sys_setreuid(uid_t ruid, uid_t euid); +long __sys_setregid(gid_t rgid, gid_t egid); +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); +long __sys_setfsuid(uid_t uid); +long __sys_setfsgid(gid_t gid); + +#endif /* LINUX_UID16_H */ diff --git a/kernel/umh.c b/kernel/umh.c index 18e5fa4..f76b3ff 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { @@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) * * Thus the __user pointer cast is valid here. */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); + kernel_wait4(pid, (int __user *)&ret, 0, NULL); /* * If ret is 0, either call_usermodehelper_exec_async failed and diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ec6ba6..254e636 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
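[Editor's note] The kernel/sched/wait_bit.c hunk earlier in this diff replaces the old wait_on_atomic_t()/wake_up_atomic_t() helpers with a generic wait-on-variable interface (__var_waitqueue(), init_wait_var_entry(), wake_up_var()), which pairs with the wait_var_event() macro added to <linux/wait_bit.h> in the same series (not shown here). The following is a minimal illustrative sketch of how a caller might use that pair; the struct foo, its users field and the foo_wait_idle()/foo_put() helpers are invented for the example and are not part of this commit.

/*
 * Illustrative sketch only -- not part of the diff above.
 * Assumes wait_var_event() from <linux/wait_bit.h>, which backs onto the
 * __var_waitqueue()/wake_up_var() code shown in the wait_bit.c hunk.
 */
#include <linux/wait_bit.h>
#include <linux/atomic.h>

struct foo {
	atomic_t users;		/* hypothetical reference count */
};

/* Waiter side: sleep until the last user drops its reference. */
static void foo_wait_idle(struct foo *f)
{
	wait_var_event(&f->users, atomic_read(&f->users) == 0);
}

/* Waker side: drop a reference and wake anyone sleeping on &f->users. */
static void foo_put(struct foo *f)
{
	if (atomic_dec_and_test(&f->users))
		wake_up_var(&f->users);
}

As with the bit-wait helpers, the store that makes the condition true must be ordered before the wake_up_var() call; in this sketch atomic_dec_and_test() provides that ordering.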