diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 6 | ||||
-rw-r--r-- | kernel/auditfilter.c | 3 | ||||
-rw-r--r-- | kernel/cpuset.c | 10 | ||||
-rw-r--r-- | kernel/futex.c | 93 | ||||
-rw-r--r-- | kernel/kgdb.c | 3 | ||||
-rw-r--r-- | kernel/kprobes.c | 15 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 70 | ||||
-rw-r--r-- | kernel/sched_rt.c | 66 | ||||
-rw-r--r-- | kernel/sched_stats.h | 6 | ||||
-rw-r--r-- | kernel/softlockup.c | 15 |
11 files changed, 202 insertions, 87 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index e8692a5..e092f1c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb), msg_type); + err = audit_filter_user(&NETLINK_CB(skb)); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { @@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0e0bd27e..98c50cc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, @@ -1720,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb, int type) +int audit_filter_user(struct netlink_skb_parms *cb) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 039baa4..9fceb97 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1037,8 +1037,8 @@ int 
current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { - if ((int)val < 0) - val = -1; + if (val < -1 || val >= SD_LV_MAX) + return -EINVAL; if (val != cs->relax_domain_level) { cs->relax_domain_level = val; @@ -1890,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/futex.c b/kernel/futex.c index 449def8..7d1136e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. 
This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. 
*/ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 79e3c90..3ec23c3 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -void kgdb_console_write(struct console *co, const char *s, unsigned count) +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) { unsigned long flags; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1e0250c..d4998f8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -699,8 +699,9 @@ static int __register_kprobes(struct kprobe **kps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kprobe(kps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kprobes(kps, i); + if (ret < 0) { + if (i > 0) + unregister_kprobes(kps, i); break; } } @@ -776,8 
+777,9 @@ static int __register_jprobes(struct jprobe **jps, int num, jp->kp.break_handler = longjmp_break_handler; ret = __register_kprobe(&jp->kp, called_from); } - if (ret < 0 && i > 0) { - unregister_jprobes(jps, i); + if (ret < 0) { + if (i > 0) + unregister_jprobes(jps, i); break; } } @@ -920,8 +922,9 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kretprobe(rps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kretprobes(rps, i); + if (ret < 0) { + if (i > 0) + unregister_kretprobes(rps, i); break; } } diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf19..5e02b774 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8..3aaa5c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock); #endif /* - * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. * (The default weight is 1024 - so there's no practical * limitation from this.) 
*/ #define MIN_SHARES 2 -#define MAX_SHARES (ULONG_MAX - 1) +#define MAX_SHARES (1UL << 18) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -1124,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1179,6 +1183,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { @@ -1337,8 +1342,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (!lw->inv_weight) - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* @@ -4159,12 +4169,10 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } @@ -4390,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static 
long __sched @@ -6871,7 +6877,12 @@ static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) { - default_relax_domain_level = simple_strtoul(str, NULL, 0); + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + return 1; } __setup("relax_domain_level=", setup_relax_domain_level); @@ -7230,6 +7241,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) } /* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + +/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. @@ -7376,6 +7399,7 @@ int arch_reinit_sched_domains(void) get_online_cpus(); mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -7461,6 +7485,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7479,8 +7504,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. 
+ */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } @@ -7620,7 +7653,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, else rt_se->rt_rq = parent->my_q; - rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); @@ -8342,7 +8374,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_CGROUP_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_group *tgi, *parent = tg->parent; + struct task_group *tgi, *parent = tg ? tg->parent : NULL; unsigned long total = 0; if (!parent) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d57..0f3c191 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); @@ -449,13 +450,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #endif } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && rt_rq_throttled(group_rq)) + /* + * Don't enqueue the group if its throttled, or when empty. + * The latter is a consequence of the former when a child group + * get throttled and the current group doesn't have any other + * active members. 
+ */ + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -464,7 +471,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) inc_rt_tasks(rt_se, rt_rq); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -480,11 +487,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct task_struct *p) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) { - struct sched_rt_entity *rt_se, *back = NULL; + struct sched_rt_entity *back = NULL; - rt_se = &p->rt; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; @@ -492,7 +498,26 @@ static void dequeue_rt_stack(struct task_struct *p) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se); + } +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) + __enqueue_rt_entity(rt_se); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + + for_each_sched_rt_entity(rt_se) { + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) + __enqueue_rt_entity(rt_se); } } @@ -506,32 +531,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) if (wakeup) rt_se->timeout = 0; - dequeue_rt_stack(p); - - /* - * enqueue everybody, bottom - up. 
- */ - for_each_sched_rt_entity(rt_se) - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct sched_rt_entity *rt_se = &p->rt; - struct rt_rq *rt_rq; update_curr_rt(rq); - - dequeue_rt_stack(p); - - /* - * re-enqueue all non-empty rt_rq entities. - */ - for_each_sched_rt_entity(rt_se) { - rt_rq = group_rt_rq(rt_se); - if (rt_rq && rt_rq->rt_nr_running) - enqueue_rt_entity(rt_se); - } + dequeue_rt_entity(rt_se); } /* @@ -542,8 +550,10 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (on_rt_rq(rt_se)) + list_move_tail(&rt_se->run_list, queue); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index a38878e..80179ef 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -198,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t) /* * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. 
*/ static inline void sched_info_depart(struct task_struct *t) { @@ -206,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t) t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(t); } /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 01b6522..c828c23 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -void touch_softlockup_watchdog(void) +static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); } + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(touch_timestamp) = 0; +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -80,7 +85,7 @@ void softlockup_tick(void) unsigned long now; if (touch_timestamp == 0) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -95,7 +100,7 @@ void softlockup_tick(void) /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -214,7 +219,7 @@ static int watchdog(void *__bind_cpu) sched_setscheduler(current, SCHED_FIFO, &param); /* initialize timestamp */ - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); set_current_state(TASK_INTERRUPTIBLE); /* @@ -223,7 +228,7 @@ static int watchdog(void *__bind_cpu) * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) |