From ea6f18ed5a1531caf678374f30a0990c9e6742f3 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: reduce stack size requirements in kernel/sched.c Impact: cleanup * use node_to_cpumask_ptr in place of node_to_cpumask to reduce stack requirements in sched.c Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bb82765..dd22cec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6110,8 +6110,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) do { /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); + node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); + + cpus_and(mask, *pnodemask, p->cpus_allowed); dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ @@ -7098,9 +7099,9 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) { int group; + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); group = first_cpu(*nodemask); if (sg) @@ -7150,9 +7151,9 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); if (cpus_empty(*nodemask)) continue; -- cgit v1.1 From abcd083a1a658d2bc1f7fced02632bfe03918002 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: convert sched.c from for_each_cpu_mask to for_each_cpu. Impact: trivial API conversion This is a simple conversion, but note that for_each_cpu() terminates with i >= nr_cpu_ids, not i == NR_CPUS like for_each_cpu_mask() did. I don't convert all of them: sd->span changes in a later patch, so change those iterators there rather than here. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dd22cec..e59978e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2061,7 +2061,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2103,7 +2103,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu(i, tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -3121,7 +3121,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3400,7 +3400,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3942,7 +3942,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, &cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -6906,7 +6906,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6917,7 +6917,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -7117,7 +7117,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, &sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -7142,7 +7142,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7396,7 +7396,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7463,7 +7463,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7480,7 +7480,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7547,7 +7547,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7593,21 +7593,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); @@ -7627,7 +7627,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7709,7 +7709,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); -- cgit v1.1 From 3404c8d97c2d3eb87b1bf4aadad957bfb5235b14 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: get rid of boutique sched.c allocations, use cpumask_var_t. Impact: use new general API Using lots of allocs rather than one big alloc is less efficient, but who cares for this setup function? Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/sched.c | 139 +++++++++++++++++++++++---------------------------------- 1 file changed, 55 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e59978e..0dc9d57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7263,48 +7263,6 @@ SD_INIT_FUNC(CPU) SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. - */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); -} -static inline void sched_cpumask_free(struct allmasks *masks) -{ - kfree(masks); -} -#else -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ } -static inline void sched_cpumask_free(struct allmasks *masks) -{ } -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@ -7347,14 +7305,35 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@ -7362,33 +7341,16 @@ static int __build_sched_domains(const cpumask_t *cpu_map, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - #ifdef CONFIG_NUMA sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -7398,7 +7360,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, */ for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); @@ -7464,9 +7425,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_sibling_map = per_cpu(cpu_sibling_map, i); cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); if (i != first_cpu(*this_sibling_map)) @@ -7481,9 +7439,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) @@ -7497,9 +7452,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); if (cpus_empty(*nodemask)) @@ -7513,8 +7465,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@ -7523,9 +7473,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; *nodemask = node_to_cpumask(i); @@ -7560,7 +7507,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); @@ -7639,15 +7585,40 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); kfree(rd); - return -ENOMEM; + goto free_tmpmask; #endif } -- cgit v1.1 From 1e5ce4f4a755ee498bd9217dae26143afa0d8f31 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: remove any_online_cpu() Impact: use new API any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc9d57..a2de33d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5964,7 +5964,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6113,11 +6113,12 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); + dest_cpu = cpumask_any_and(cpu_online_mask, &mask); /* On any allowed CPU? */ if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { @@ -6133,7 +6134,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ rq = task_rq_lock(p, &flags); p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); task_rq_unlock(rq, &flags); /* @@ -6159,7 +6161,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@ -6524,7 +6526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; -- cgit v1.1 From 758b2cdc6f6a22c702bd8f2344382fb1270b2161 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: wrap sched_group and sched_domain cpumask accesses. Impact: trivial wrap of member accesses This eases the transition in the next patch. We also get rid of a temporary cpumask in find_idlest_cpu() thanks to for_each_cpu_and, and sched_balance_self() due to getting weight before setting sd to NULL. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 114 +++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 10 ++--- kernel/sched_rt.c | 3 +- kernel/sched_stats.h | 3 +- 4 files changed, 63 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a2de33d..575f38a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1501,7 +1501,7 @@ static int tg_shares_up(struct task_group *tg, void *data) struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1522,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -2053,15 +2053,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2093,17 +2095,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu(i, tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2145,7 +2144,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2154,14 +2152,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2170,10 +2167,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2218,7 +2215,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2266,7 +2263,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -3109,10 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3121,13 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu(i, &group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3238,8 +3231,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3254,8 +3247,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3400,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3746,7 +3739,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -6618,7 +6611,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6633,11 +6626,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -6657,31 +6650,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6721,7 +6715,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@ -6752,7 +6746,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? */ @@ -6913,10 +6907,10 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; for_each_cpu(j, span) { @@ -6924,7 +6918,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, continue; cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@ -7119,11 +7113,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu(j, &sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@ -7200,7 +7194,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@ -7372,7 +7366,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7382,18 +7376,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; sd = &per_cpu(phys_domains, i); SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@ -7404,8 +7399,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7416,8 +7412,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(cpu_domains, i); SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7503,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; cpus_or(*covered, *covered, *nodemask); prev = sg; @@ -7530,7 +7526,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; cpus_or(*covered, *covered, *tmpmask); prev->next = sg; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e4..bba0040 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1024,7 +1024,6 @@ static void yield_task_fair(struct rq *rq) #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; @@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p) if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1240,7 +1238,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd444..4cd813a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1017,7 +1017,8 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a..ce34083 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + *sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { -- cgit v1.1 From 6c99e9ad47d9c082bd096f42fb49e397b05d58a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert struct sched_group/sched_domain cpumask_ts to variable bitmaps Impact: (future) size reduction for large NR_CPUS. We move the 'cpumask' member of sched_group to the end, so when we kmalloc it we can do a minimal allocation: saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. Similar trick for 'span' in sched_domain. This isn't quite as good as converting to a cpumask_var_t, as some sched_groups are actually static, but it's safer: we don't have to figure out where to call alloc_cpumask_var/free_cpumask_var. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 65 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 575f38a..6b9606a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7006,18 +7006,33 @@ static void sched_domain_node_span(int node, cpumask_t *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. + */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + +/* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -7026,8 +7041,8 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -7041,7 +7056,7 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) @@ -7050,13 +7065,13 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, @@ -7075,7 +7090,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@ -7089,7 +7104,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) @@ -7101,7 +7116,7 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, group = first_cpu(*nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@ -7116,7 +7131,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); + sd = &per_cpu(phys_domains, j).sd; if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each @@ -7385,7 +7400,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), nodemask); @@ -7396,7 +7411,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); *sched_domain_span(sd) = cpu_coregroup_map(i); @@ -7409,7 +7424,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), @@ -7485,7 +7500,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sched_domain_node_span(i, domainspan); cpus_and(*domainspan, *domainspan, *cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); @@ -7518,7 +7534,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (cpus_empty(*tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@ -7538,21 +7555,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -7574,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } -- cgit v1.1 From 6a7b3dc3440f7b5a9b67594af01ed562cdeb41e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert nohz_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- kernel/sched.c | 7 +++++-- kernel/time/tick-sched.c | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a00..c03ca3e 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 6b9606a..2723d7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,9 +5870,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -8274,6 +8274,9 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); + scheduler_running = 1; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9c..70f872c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle) /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. Update process times would miss the -- cgit v1.1 From c6c4927b22a3514c6660f0e72c78716226bd3cc8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:05 +1030 Subject: sched: convert struct root_domain to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. def_root_domain is static, and so its masks are initialized with alloc_bootmem_cpumask_var. After that, alloc_cpumask_var is used. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 69 ++++++++++++++++++++++++++++++++++++++++--------------- kernel/sched_rt.c | 26 ++++++++++----------- 2 files changed, 64 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2723d7a..93309c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -487,14 +487,14 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; @@ -6444,7 +6444,7 @@ static void set_rq_online(struct rq *rq) if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@ -6464,7 +6464,7 @@ static void set_rq_offline(struct rq *rq) class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@ -6505,7 +6505,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@ -6567,7 +6567,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@ -6768,6 +6768,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@ -6777,38 +6785,60 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; cpupri_init(&rd->cpupri); + return 0; + +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@ -6820,7 +6850,10 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@ -7632,7 +7665,7 @@ free_sched_groups: #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - kfree(rd); + free_rootdomain(rd); goto free_tmpmask; #endif } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4cd813a..820fc42 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq) if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq) int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq) /* * Greedy reclaim, take back as much as we can. */ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1181,7 +1181,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; -- cgit v1.1 From 7d1e6a9b95e3edeac91888bc683ae62f18519432 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:09 +1030 Subject: sched: convert nohz struct to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93309c3..2f8ea99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3758,10 +3758,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3789,7 +3788,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3803,7 +3802,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3816,10 +3815,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3930,12 +3929,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu(balance_cpu, &cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3973,7 +3973,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -3986,7 +3986,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -3998,7 +3998,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4008,7 +4008,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -8309,6 +8309,9 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif scheduler_running = 1; } -- cgit v1.1 From 4d2732c63e0c05cfef2a74868d08eace922dfc3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:10 +1030 Subject: sched: convert idle_balance() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2f8ea99..154a95f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,7 +3676,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3687,7 +3690,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3702,6 +3705,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* -- cgit v1.1 From a0e902452da16b79d7c9230630ed8a595d14fa85 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert rebalance_domains() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 154a95f..67383e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3850,7 +3850,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. */ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3875,7 +3879,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3909,6 +3913,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* -- cgit v1.1 From f17c860760927c2a8e41a021eab3317e4415e962 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert sys_sched_getaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Some jiggling here to make sure we always exit at the bottom (so we hit the free_cpumask_var there). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 67383e7..6deff24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5499,19 +5499,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** -- cgit v1.1 From e76bd8d9850c2296a7e8e24c9dce9b5e6b55fe2f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu Impact: stack usage reduction With some care, we can avoid needing a temporary cpumask (we can't really allocate here, since we can't fail). This version calls cpuset_cpus_allowed_locked() with the task_rq_lock held. I'm fairly sure this works, but there might be a deadlock hiding. And of course, we can't get rid of the last cpumask on stack until we can use cpumask_of_node instead of node_to_cpumask. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6deff24..f7dee20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6112,52 +6112,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; - cpumask_t mask; struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rq = task_rq_lock(p, &flags); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + task_rq_unlock(rq, &flags); - do { - /* On same node? */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); - - cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* -- cgit v1.1 From 5a16f3d30ca4e3f166d691220c003066a14e32b5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert struct (sys_)sched_setaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space on the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Note the removal of the initializer of new_mask: since the first thing we did was "cpus_and(new_mask, new_mask, cpus_allowed)" I just changed that to "cpumask_and(new_mask, in_mask, cpus_allowed);". Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f7dee20..2d4ff91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5378,8 +5378,7 @@ out_unlock: long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5401,6 +5400,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) @@ -5410,24 +5417,28 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; @@ -5453,14 +5464,17 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } long sched_getaffinity(pid_t pid, cpumask_t *mask) -- cgit v1.1 From d5dd3db1dce73cdd5c45c5a3498c51bd21b8864b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert sched_domain_debug to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. In this case, we always alloced, but we don't need to any more. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2d4ff91..24012c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6706,7 +6706,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@ -6716,8 +6716,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@ -6730,7 +6729,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -- cgit v1.1 From dcc30a35f71bcf51f1e9b336dc5e41923071509a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert cpu_isolated_map to cpumask_var_t. Impact: stack usage reduction, (future) size reduction, cleanup Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. We can also use cpulist_parse() instead of doing it manually in isolated_cpu_setup. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 24012c2..526618f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6917,19 +6917,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, *cpu_isolated_map); return 1; } @@ -7727,7 +7720,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7826,7 +7819,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7985,7 +7978,9 @@ static int update_runtime(struct notifier_block *nfb, void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -7994,10 +7989,10 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -8012,9 +8007,10 @@ void __init sched_init_smp(void) init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); } #else void __init sched_init_smp(void) @@ -8334,6 +8330,7 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); scheduler_running = 1; } -- cgit v1.1 From 4212823fb459eacc8098dd420bb68ebb9917989d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert falback_doms to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 526618f..42588ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7697,10 +7697,10 @@ static struct sched_domain_attr *dattr_cur; /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; void __attribute__((weak)) arch_update_cpu_topology(void) { @@ -7719,7 +7719,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; + doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); @@ -7818,7 +7818,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; + doms_new = fallback_doms; cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7838,7 +7838,7 @@ match2: } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@ -8011,6 +8011,8 @@ void __init sched_init_smp(void) BUG(); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); } #else void __init sched_init_smp(void) -- cgit v1.1 From 68e74568fbe5854952355e942acca51f138096d9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert struct cpupri_vec cpumask_var_t. Impact: stack usage reduction, (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. The fact cpupro_init is called both before and after the slab is available makes for an ugly parameter unfortunately. We also use cpumask_any_and to get rid of a temporary in cpupri_find. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- kernel/sched_cpupri.c | 39 ++++++++++++++++++++++++++++----------- kernel/sched_cpupri.h | 5 +++-- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 42588ad..94fa333 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6792,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6834,7 +6836,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) alloc_bootmem_cpumask_var(&def_root_domain.span); alloc_bootmem_cpumask_var(&def_root_domain.online); alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri); + cpupri_init(&rd->cpupri, true); return 0; } @@ -6845,9 +6847,12 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - cpupri_init(&rd->cpupri); + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; return 0; +free_rto_mask: + free_cpumask_var(rd->rto_mask); free_online: free_cpumask_var(rd->online); free_span: diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 52154fe..018b7be 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -67,24 +67,21 @@ static int convert_prio(int prio) * Returns: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) + struct cpumask *lowest_mask) { int idx = 0; int task_pri = convert_prio(p->prio); for_each_cpupri_active(cp->pri_active, idx) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; if (idx >= task_pri) break; - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - *lowest_mask = mask; + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } @@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); + cpumask_clear_cpu(cpu, vec->mask); spin_unlock_irqrestore(&vec->lock, flags); } @@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_lock_irqsave(&vec->lock, flags); - cpu_set(cpu, vec->mask); + cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); @@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem * - * Returns: (void) + * Returns: -ENOMEM if memory fails. */ -void cpupri_init(struct cpupri *cp) +int cpupri_init(struct cpupri *cp, bool bootmem) { int i; @@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp) spin_lock_init(&vec->lock); vec->count = 0; - cpus_clear(vec->mask); + if (bootmem) + alloc_bootmem_cpumask_var(&vec->mask); + else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; } for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; } +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index f25811b0..642a94e 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -14,7 +14,7 @@ struct cpupri_vec { spinlock_t lock; int count; - cpumask_t mask; + cpumask_var_t mask; }; struct cpupri { @@ -27,7 +27,8 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, cpumask_t *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -void cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) #define cpupri_init() do { } while (0) -- cgit v1.1 From 24600ce89a819a8f2fb4fd69fd777218a82ade20 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert check_preempt_equal_prio to cpumask_var_t. Impact: stack reduction for large NR_CPUS Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. We simply return if the allocation fails: since we don't use it we could just pass NULL to cpupri_find and have it handle that. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 820fc42..1fa1362 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ -- cgit v1.1 From 0e3900e6d3b04c44737ebc505604dcd8ed30e354 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94fa333..f2be618 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,6 +8018,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1fa1362..1f0e99d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -962,7 +962,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1551,3 +1551,12 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + +/* Note that this is never called for !SMP, but that's OK. */ +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} -- cgit v1.1 From 96f874e26428ab5d2db681c100210c254775e154 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:14 +1030 Subject: sched: convert remaining old-style cpumask operators Impact: Trivial API conversion NR_CPUS -> nr_cpu_ids cpumask_t -> struct cpumask sizeof(cpumask_t) -> cpumask_size() cpumask_a = cpumask_b -> cpumask_copy(&cpumask_a, &cpumask_b) cpu_set() -> cpumask_set_cpu() first_cpu() -> cpumask_first() cpumask_of_cpu() -> cpumask_of() cpus_* -> cpumask_* There are some FIXMEs where we all archs to complete infrastructure (patches have been sent): cpu_coregroup_map -> cpu_coregroup_mask node_to_cpumask* -> cpumask_of_node There is also one FIXME where we pass an array of cpumasks to partition_sched_domains(): this implies knowing the definition of 'struct cpumask' and the size of a cpumask. This will be fixed in a future patch. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 212 ++++++++++++++++++++++++++++------------------------ kernel/sched_fair.c | 4 +- kernel/sched_rt.c | 18 ++--- 3 files changed, 124 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f2be618..eba6a15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,7 +2829,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2895,7 +2895,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3070,7 +3070,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3387,7 +3387,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3396,7 +3396,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3426,7 +3426,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -3434,7 +3434,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3494,8 +3494,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3512,7 +3512,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3587,7 +3588,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3596,7 +3597,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3640,8 +3641,8 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } @@ -5376,7 +5377,7 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; @@ -5445,13 +5446,13 @@ out_put_task: } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,7 +5478,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5494,7 +5495,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5872,7 +5873,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5956,7 +5957,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5964,13 +5965,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5978,12 +5979,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { @@ -6028,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6629,13 +6630,13 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); - cpus_clear(*groupmask); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6936,24 +6937,25 @@ __setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); for_each_cpu(i, span) { struct sched_group *sg; @@ -6970,7 +6972,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); + cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) @@ -7035,9 +7037,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@ -7081,8 +7084,8 @@ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu).sg; @@ -7100,22 +7103,21 @@ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu).sg; @@ -7127,18 +7129,18 @@ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. */ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif @@ -7159,14 +7161,16 @@ static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpus_and(*nodemask, *pnodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group).sg; @@ -7202,7 +7206,8 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; @@ -7215,10 +7220,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, i); cpus_and(*nodemask, *pnodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@ -7236,7 +7242,8 @@ next_sg: } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@ -7366,7 +7373,7 @@ static void set_domain_attribute(struct sched_domain *sd, * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { int i, err = -ENOMEM; @@ -7416,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* @@ -7425,12 +7432,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); @@ -7491,9 +7499,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@ -7505,9 +7513,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@ -7518,9 +7527,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@ -7541,17 +7551,18 @@ static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_group *sg, *prev; int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -7570,21 +7581,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -7598,7 +7610,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@ -7634,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } @@ -7690,12 +7702,12 @@ error: #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -7716,13 +7728,13 @@ void __attribute__((weak)) arch_update_cpu_topology(void) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); @@ -7733,8 +7745,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@ -7743,15 +7755,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map, * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7776,7 +7789,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@ -7790,13 +7803,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@ -7811,7 +7825,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@ -7831,7 +7845,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bba0040..08ffffd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1017,7 +1017,7 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ @@ -1244,7 +1244,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1f0e99d..fb39645 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -923,7 +923,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -997,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task) * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@ -1007,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task) * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@ -1064,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@ -1315,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@ -1338,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } -- cgit v1.1 From bf4d83f66476086c6b50dc52aac00d71ad70494e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:57:51 +1030 Subject: sched: convert nohz struct to cpumask_var_t, fix Impact: build fix Fix the !CONFIG_SMP case. Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index eba6a15..1aa840a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8349,10 +8349,12 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ scheduler_running = 1; } -- cgit v1.1 From 3d8cbdf8650f44d95333ca645d950832a0653f35 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:58:41 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t, fix Impact: build fix for !CONFIG_SMP Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fb39645..94aab72 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1381,6 +1381,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} #endif /* CONFIG_SMP */ /* @@ -1552,11 +1560,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) } #endif /* CONFIG_SCHED_DEBUG */ -/* Note that this is never called for !SMP, but that's OK. */ -static inline void init_sched_rt_class(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); -} -- cgit v1.1 From 1224e376f2a7e3c7ab19ef37099a78597978a696 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:59:20 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu, fix Impact: locking fix We can't call cpuset_cpus_allowed_locked() with the rq lock held. However, the rq lock merely protects us from (1) cpu_online_mask changing and (2) someone else changing p->cpus_allowed. The first can't happen because we're being called from a cpu hotplug notifier. The second doesn't really matter: we are forcing the task off a CPU it was affine to, so we're not doing very well anyway. So we remove the rq lock from this path, and all is good. Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1aa840a..3f5bfdc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6126,8 +6126,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - struct rq *rq; int dest_cpu; /* FIXME: Use cpumask_of_node here. */ cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); @@ -6146,10 +6144,8 @@ again: /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { - rq = task_rq_lock(p, &flags); cpuset_cpus_allowed_locked(p, &p->cpus_allowed); dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or -- cgit v1.1 From 98a79d6a50181ca1ecf7400eda01d5dc1bc0dbf0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:19:41 +1030 Subject: cpumask: centralize cpu_online_map and cpu_possible_map Impact: cleanup Each SMP arch defines these themselves. Move them to a central location. Twists: 1) Some archs (m32, parisc, s390) set possible_map to all 1, so we add a CONFIG_INIT_ALL_POSSIBLE for this rather than break them. 2) mips and sparc32 '#define cpu_possible_map phys_cpu_present_map'. Those archs simply have phys_cpu_present_map replaced everywhere. 3) Alpha defined cpu_possible_map to cpu_present_map; this is tricky so I just manipulate them both in sync. 4) IA64, cris and m32r have gratuitous 'extern cpumask_t cpu_possible_map' declarations. Signed-off-by: Rusty Russell Reviewed-by: Grant Grundler Tested-by: Tony Luck Acked-by: Ingo Molnar Cc: Mike Travis Cc: ink@jurassic.park.msu.ru Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: tony.luck@intel.com Cc: takata@linux-m32r.org Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: paulus@samba.org Cc: schwidefsky@de.ibm.com Cc: lethal@linux-sh.org Cc: wli@holomorphy.com Cc: davem@davemloft.net Cc: jdike@addtoit.com Cc: mingo@redhat.com --- kernel/cpu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ea32e8..bae131a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,19 +24,20 @@ cpumask_t cpu_present_map __read_mostly; EXPORT_SYMBOL(cpu_present_map); -#ifndef CONFIG_SMP - /* * Represents all cpu's that are currently online. */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); +#ifdef CONFIG_INIT_ALL_POSSIBLE cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +#else +cpumask_t cpu_possible_map __read_mostly; +#endif EXPORT_SYMBOL(cpu_possible_map); -#else /* CONFIG_SMP */ - +#ifdef CONFIG_SMP /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); -- cgit v1.1 From 29c0177e6a4ac094302bed54a1d4bbb6b740a9ef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:25 +1030 Subject: cpumask: change cpumask_scnprintf, cpumask_parse_user, cpulist_parse, and cpulist_scnprintf to take pointers. Impact: change calling convention of existing cpumask APIs Most cpumask functions started with cpus_: these have been replaced by cpumask_ ones which take struct cpumask pointers as expected. These four functions don't have good replacement names; fortunately they're rarely used, so we just change them over. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Cc: paulus@samba.org Cc: mingo@redhat.com Cc: tony.luck@intel.com Cc: ralf@linux-mips.org Cc: Greg Kroah-Hartman Cc: cl@linux-foundation.org Cc: srostedt@redhat.com --- kernel/cpuset.c | 4 ++-- kernel/irq/proc.c | 4 ++-- kernel/profile.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/sched_stats.h | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 96c0ba1..39c1a4c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs.cpus_allowed); if (retval < 0) return retval; @@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) mask = cs->cpus_allowed; mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, mask); + return cpulist_scnprintf(page, PAGE_SIZE, &mask); } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d..f293349 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -47,7 +47,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; @@ -95,7 +95,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_t new_value; int err; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/profile.c b/kernel/profile.c index dc41827..7d620df 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,7 +442,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + int len = cpumask_scnprintf(page, count, (cpumask_t *)data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file, unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/sched.c b/kernel/sched.c index e4bb1dd..d2d16d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6666,7 +6666,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), &sd->span); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6720,7 +6720,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpus_or(*groupmask, *groupmask, group->cpumask); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), &group->cpumask); printk(KERN_CONT " %s", str); group = group->next; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a..6beff1e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, &sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76..6d7dc4e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask) if (!data) return -ENOMEM; nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); + ret = cpulist_parse(data, mask); kfree(data); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e325..d2e7547 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2126,7 +2126,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2147,7 +2147,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; -- cgit v1.1 From 0de26520c7cabf36e1de090ea8092f011a6106ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: make irq_set_affinity() take a const struct cpumask Impact: change existing irq_chip API Not much point with gentle transition here: the struct irq_chip's setaffinity method signature needs to change. Fortunately, not widely used code, but hits a few architectures. Note: In irq_select_affinity() I save a temporary in by mangling irq_desc[irq].affinity directly. Ingo, does this break anything? (Folded in fix from KOSAKI Motohiro) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Reviewed-by: Grant Grundler Acked-by: Ingo Molnar Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: jeremy@xensource.com Cc: KOSAKI Motohiro --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 22 ++++++++++------------ kernel/irq/migration.c | 14 +++++++------- kernel/irq/proc.c | 29 +++++++++++++++++++---------- kernel/time/tick-common.c | 6 +++--- 5 files changed, 40 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092..58d8e31 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -45,7 +45,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addd..10ad2f8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq) * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); +set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9db681d..bd72329 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,7 +4,6 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); - cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -19,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(desc->pending_mask))) + if (unlikely(cpumask_empty(&desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -27,8 +26,6 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, desc->pending_mask, cpu_online_map); - /* * If there was a valid mask to work with, please * do the disable, re-program, enable sequence. @@ -41,10 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); + if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + cpumask_and(&desc->affinity, + &desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, &desc->affinity); } - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f293349..8e91c97 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; + cpumask_var_t new_value; int err; if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, &new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(*new_value)) { + err = -EINVAL; + goto free_cpumask; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) + if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity_usr(irq) ? -EINVAL : count; - - irq_set_affinity(irq, new_value); + err = irq_select_affinity_usr(irq) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } - return count; +free_cpumask: + free_cpumask_var(new_value); + return err; } static int irq_affinity_proc_open(struct inode *inode, struct file *file) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434..ab65d21 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) + const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); + if (!cpumask_equal(&newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current -- cgit v1.1 From 320ab2b0b1e08e3805a3e1084a2f0eb1938d5d67 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: convert struct clock_event_device to cpumask pointers. Impact: change calling convention of existing clock_event APIs struct clock_event_timer's cpumask field gets changed to take pointer, as does the ->broadcast function. Another single-patch change. For safety, we BUG_ON() in clockevents_register_device() if it's not set. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/time/clockevents.c | 2 ++ kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f8d9680..ea2f48a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -166,6 +166,8 @@ static void clockevents_notify_released(void) void clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f98a1b7..9590af2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask) */ cpu = first_cpu(mask); td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(mask); + td->evtdev->broadcast(&mask); } } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ab65d21..f8372be 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -171,7 +171,7 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpumask_equal(&newdev->cpumask, cpumask)) + if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* @@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) + if (!cpumask_test_cpu(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) goto out_bc; } -- cgit v1.1 From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 18 Dec 2008 23:26:09 +0530 Subject: sched: framework for sched_mc/smt_power_savings=N Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings Currently the sched_mc/smt_power_savings variable is a boolean, which either enables or disables topology based power savings. This patch extends the behaviour of the variable from boolean to multivalued, such that based on the value, we decide how aggressively do we want to perform powersavings balance at appropriate sched domain based on topology. Variable levels of power saving tunable would benefit end user to match the required level of power savings vs performance trade-off depending on the system configuration and workloads. This version makes the sched_mc_power_savings global variable to take more values (0,1,2). Later versions can have a single tunable called sched_power_savings instead of sched_{mc,smt}_power_savings. Signed-off-by: Gautham R Shenoy Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b309027..56b285c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void) static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); -- cgit v1.1 From d5679bd11916eba5c8ee9033003e1a5ce56ece9a Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:16 +0530 Subject: sched: favour lower logical cpu number for sched_mc balance Impact: change load-balancing direction to match that of irqbalanced Just in case two groups have identical load, prefer to move load to lower logical cpu number rather than the present logic of moving to higher logical number. find_busiest_group() tries to look for a group_leader that has spare capacity to take more tasks and freeup an appropriate least loaded group. Just in case there is a tie and the load is equal, then the group with higher logical number is favoured. This conflicts with user space irqbalance daemon that will move interrupts to lower logical number if the system utilisation is very low. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 56b285c..94b9d11 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3241,7 +3241,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group)) > cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; @@ -3257,7 +3257,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group)) < cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; -- cgit v1.1 From 7a09b1a27b1e5a4957e4af9951420fea02c44fba Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:22 +0530 Subject: sched: nominate preferred wakeup cpu Impact: extend load-balancing code (no change in behavior yet) When the system utilisation is low and more cpus are idle, then the process waking up from sleep should prefer to wakeup an idle cpu from semi-idle cpu package (multi core package) rather than a completely idle cpu package which would waste power. Use the sched_mc balance logic in find_busiest_group() to nominate a preferred wakeup cpu. This info can be stored in appropriate sched_domain, but updating this info in all copies of sched_domain is not practical. Hence this information is stored in root_domain struct which is one copy per partitioned sched domain. The root_domain can be accessed from each cpu's runqueue and there is one copy per partitioned sched domain. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94b9d11..c1b8b30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -509,6 +509,14 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@ -3384,6 +3392,10 @@ out_balanced: if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + first_cpu(group_leader->cpumask); + } return group_min; } #endif -- cgit v1.1 From 7eb52dfa70dbf5232b5b83ec4357e6bebaa8fde8 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:29 +0530 Subject: sched: bias task wakeups to preferred semi-idle packages Impact: tweak task wakeup to save power more agressively Preferred wakeup cpu (from a semi idle package) has been nominated in find_busiest_group() in the previous patch. Use this information in sched_mc_preferred_wakeup_cpu in function wake_idle() to bias task wakeups if the following conditions are satisfied: - The present cpu that is trying to wakeup the process is idle and waking the target process on this cpu will potentially wakeup a completely idle package - The previous cpu on which the target process ran is also idle and hence selecting the previous cpu may wakeup a semi idle cpu package - The task being woken up is allowed to run in the nominated cpu (cpu affinity and restrictions) Basically if both the current cpu and the previous cpu on which the task ran is idle, select the nominated cpu from semi idle cpu package for running the new task that is waking up. Cache hotness is considered since the actual biasing happens in wake_idle() only if the application is cache cold. This technique will effectively move short running bursty jobs in a mostly idle system. Wakeup biasing for power savings gets automatically disabled if system utilisation increases due to the fact that the probability of finding both this_cpu and prev_cpu idle decreases. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ffffd..36b5e34 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1026,6 +1026,24 @@ static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. -- cgit v1.1 From ad273b32e482cdef306eac32b28d97f513a022f4 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:36 +0530 Subject: sched: activate active load balancing in new idle cpus Impact: tweak task balancing to save power more agressively Active load balancing is a process by which migration thread is woken up on the target CPU in order to pull current running task on another package into this newly idle package. This method is already in use with normal load_balance(), this patch introduces this method to new idle cpus when sched_mc is set to POWERSAVINGS_BALANCE_WAKEUP. This logic provides effective consolidation of short running daemon jobs in a almost idle system The side effect of this patch may be ping-ponging of tasks if the system is moderately utilised. May need to adjust the iterations before triggering. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1b8b30..8fc0d5a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,10 +3670,64 @@ redo: } if (!ld_moved) { + int active_balance; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; -- cgit v1.1 From 9924da434a13668fceb208d56dbdf86d166862cc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 00:53:40 +0100 Subject: sched: fix warning in kernel/sched.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix cpumask conversion bug this warning: kernel/sched.c: In function ‘find_busiest_group’: kernel/sched.c:3429: warning: passing argument 1 of ‘__first_cpu’ from incompatible pointer type shows that we forgot to convert a new patch to the new cpumask APIs. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8fc0d5a..ae5ca3f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3394,7 +3394,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - first_cpu(group_leader->cpumask); + cpumask_first(sched_group_cpus(group_leader)); } return group_min; } -- cgit v1.1 From 36dffab679c7eeb91c2507400cf4da6e9e01164e Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Sat, 20 Dec 2008 10:06:38 +0530 Subject: sched: nominate preferred wakeup cpu, fix Andrew Morton reported: > kernel/sched.c: In function 'schedule': > kernel/sched.c:3679: warning: 'active_balance' may be used uninitialized in this function > > This warning is correct - the code is buggy. In sched.c load_balance_newidle, there's real potential use of uninitialised variable - fix it. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ae5ca3f..756d981 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,7 +3670,7 @@ redo: } if (!ld_moved) { - int active_balance; + int active_balance = 0; schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -- cgit v1.1