diff options
Diffstat (limited to 'kernel')
88 files changed, 5609 insertions, 2488 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index e495624..9395003 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -74,6 +74,8 @@ static int audit_initialized; int audit_enabled; int audit_ever_enabled; +EXPORT_SYMBOL_GPL(audit_enabled); + /* Default state when kernel boots without any parameters. */ static int audit_default; @@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; - loginuid = NETLINK_CB(skb).loginuid; - sessionid = NETLINK_CB(skb).sessionid; - sid = NETLINK_CB(skb).sid; + loginuid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c78..e683869 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = kern_path_parent(watch->path, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index add2819..f8277c8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; + u32 sid; switch (f->type) { case AUDIT_PID: @@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, result = audit_comparator(cb->creds.gid, f->op, f->val); break; case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, f->op, f->val); + result = audit_comparator(audit_get_loginuid(current), + f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: - if (f->lsm_rule) - result = security_audit_rule_match(cb->sid, + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule, NULL); + } break; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d702..95362d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } @@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0..38b1d2c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { @@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; - - memset(&txc, 0, sizeof(struct timex)); + int err, ret; - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4349935..e92e981 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; + if (!trialcs) { + retval = -ENOMEM; + goto out; + } switch (cft->private) { case FILE_CPULIST: @@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); +out: cgroup_unlock(); return retval; } diff --git a/kernel/cred.c b/kernel/cred.c index 3a9d6dd..2343c132 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; static struct thread_group_cred init_tgcred = { .usage = ATOMIC_INIT(2), .tgid = 0, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), }; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 25e4291..05b92c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround diff --git a/kernel/futex.c b/kernel/futex.c index b766d28..bda4157 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,19 +685,17 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,14 +722,12 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -775,6 +772,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1504,8 +1505,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1525,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1578,9 +1577,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -1608,8 +1605,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. */ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " @@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); @@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2057,7 +2057,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2072,17 +2072,14 @@ retry: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* @@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2463,11 +2460,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; @@ -2678,8 +2684,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 70a298d..b8cadf7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c0..9017478 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, } }; +static int hrtimer_clock_to_base_table[MAX_CLOCKS]; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; - struct timespec xts, tom; - unsigned long seq; + ktime_t xtim, mono, boot; + struct timespec xts, tom, slp; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = hrtimer_clockid_to_base(base->index); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, @@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; - unsigned long seq; + struct timespec realtime_offset, wtm, sleep; if (!hrtimer_hres_active()) return; - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, + &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = + base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); @@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) } /* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - -/* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry * check happens. The timer gets enqueued into the rbtree. The reprogramming @@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; - hrtimer_init_timer_hres(timer); + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { + hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; + hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec..09bef82 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,6 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -11,26 +12,44 @@ config GENERIC_HARDIRQS # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n + bool + +config GENERIC_HARDIRQS_NO_COMPAT + bool # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ - def_bool n + bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE - def_bool n + bool + +# Use the generic /proc/interrupts implementation +config GENERIC_IRQ_SHOW + bool +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ - def_bool n + bool +# Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY - def_bool n - -config IRQ_PER_CPU - def_bool n + bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND - def_bool n + bool + +# Preflow handler support for fasteoi (sparc64) +config IRQ_PREFLOW_FASTEOI + bool + +# Support forced irq threading +config IRQ_FORCED_THREADING + bool config SPARSE_IRQ bool "Support sparse irq numbering" diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f..394784c 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. */ static DEFINE_MUTEX(probing_active); @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -46,13 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - + if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: @@ -60,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -75,10 +68,12 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) - desc->status |= IRQ_PENDING; + if (!desc->action && irq_settings_can_probe(desc)) { + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; + if (irq_startup(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + } } raw_spin_unlock_irq(&desc->lock); } @@ -93,13 +88,12 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (!(desc->istate & IRQS_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4a..c9c0601 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,140 +19,110 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); - return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { @@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) +static void irq_state_clr_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); + desc->istate &= ~IRQS_DISABLED; + irq_compat_clr_disabled(desc); } -EXPORT_SYMBOL_GPL(set_irq_nested_thread); -/* - * default enable function - */ -static void default_enable(struct irq_data *data) +static void irq_state_set_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + desc->istate |= IRQS_DISABLED; + irq_compat_set_disabled(desc); +} - desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; +static void irq_state_clr_masked(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_MASKED; + irq_compat_clr_masked(desc); } -/* - * default disable function - */ -static void default_disable(struct irq_data *data) +static void irq_state_set_masked(struct irq_desc *desc) { + desc->istate |= IRQS_MASKED; + irq_compat_set_masked(desc); } -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) +int irq_startup(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + irq_state_clr_disabled(desc); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_masked(desc); + return ret; + } - desc->irq_data.chip->irq_enable(data); + irq_enable(desc); return 0; } -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) +void irq_shutdown(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + irq_state_set_disabled(desc); + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); +} - desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; +void irq_enable(struct irq_desc *desc) +{ + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); +} + +void irq_disable(struct irq_desc *desc) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) void irq_chip_set_defaults(struct irq_chip *chip) { #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ if (chip->enable) chip->irq_enable = compat_irq_enable; if (chip->disable) @@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown = compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) @@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->istate & IRQS_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number @@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } @@ -501,42 +445,42 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event(desc); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; - - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + preflow_handler(desc); + handle_irq_event(desc); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; -out: +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); - +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** @@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || + !desc->action))) { + if (!irq_check_poll(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->istate & IRQS_PENDING)) { + if (!(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) + unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->istate & IRQS_PENDING) && + !(desc->istate & IRQS_DISABLED)); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } @@ -674,103 +608,84 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_compat_set_disabled(desc); + desc->istate |= IRQS_DISABLED; desc->depth = 1; } desc->handle_irq = handle; desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); + irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); -} -EXPORT_SYMBOL_GPL(__set_irq_handler); - -void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); +out: + irq_put_desc_busunlock(desc, flags); } +EXPORT_SYMBOL_GPL(__irq_set_handler); void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + irq_set_chip(irq, chip); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 0000000..6bbaf66 --- /dev/null +++ b/kernel/irq/compat.h @@ -0,0 +1,72 @@ +/* + * Compat layer for transition period + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline void irq_compat_set_progress(struct irq_desc *desc) +{ + desc->status |= IRQ_INPROGRESS; +} + +static inline void irq_compat_clr_progress(struct irq_desc *desc) +{ + desc->status &= ~IRQ_INPROGRESS; +} +static inline void irq_compat_set_disabled(struct irq_desc *desc) +{ + desc->status |= IRQ_DISABLED; +} +static inline void irq_compat_clr_disabled(struct irq_desc *desc) +{ + desc->status &= ~IRQ_DISABLED; +} +static inline void irq_compat_set_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_PENDING; +} + +static inline void irq_compat_clr_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_PENDING; +} +static inline void irq_compat_set_masked(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED; +} + +static inline void irq_compat_clr_masked(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MASKED; +} +static inline void irq_compat_set_move_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_MOVE_PENDING; +} + +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MOVE_PENDING; +} +static inline void irq_compat_set_affinity(struct irq_desc *desc) +{ + desc->status |= IRQ_AFFINITY_SET; +} + +static inline void irq_compat_clr_affinity(struct irq_desc *desc) +{ + desc->status &= ~IRQ_AFFINITY_SET; +} +#else +static inline void irq_compat_set_progress(struct irq_desc *desc) { } +static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +static inline void irq_compat_set_disabled(struct irq_desc *desc) { } +static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } +static inline void irq_compat_set_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_masked(struct irq_desc *desc) { } +static inline void irq_compat_clr_masked(struct irq_desc *desc) { } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_affinity(struct irq_desc *desc) { } +static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } +#endif + diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 0000000..d1a33b7 --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,40 @@ +/* + * Debugging printout: + */ + +#include <linux/kallsyms.h> + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_DISABLED); + PS(IRQS_PENDING); + PS(IRQS_MASKED); +} + +#undef P +#undef PS diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a71..517561f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. + * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + irqreturn_t retval = IRQ_NONE; + unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); - switch (ret) { + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) + local_irq_disable(); + + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. */ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -85,36 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: break; } - retval |= ret; + retval |= res; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); + if (!noirqdebug) + note_interrupt(irq, desc, retval); return retval; } + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); + return ret; +} + +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event + */ +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +{ + return handle_irq_event_percpu(irq_to_desc(irq), action); +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7..6c6ec9a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,27 +1,101 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include <linux/irqdesc.h> +#ifdef CONFIG_SPARSE_IRQ +# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +#else +# define IRQ_BITMAP_BITS NR_IRQS +#endif + +#define istate core_internal_state__do_not_mess_with_it + +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +# define status status_use_accessors +#endif + extern int noirqdebug; +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, +}; + +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection + * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_INPROGRESS - Interrupt in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting + * IRQS_DISABLED - irq is disabled + * IRQS_PENDING - irq is pending and replayed later + * IRQS_MASKED - irq is masked + * IRQS_SUSPENDED - irq is suspended + */ +enum { + IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_INPROGRESS = 0x00000010, + IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, + IRQS_DISABLED = 0x00000100, + IRQS_PENDING = 0x00000200, + IRQS_MASKED = 0x00000400, + IRQS_SUSPENDED = 0x00000800, +}; + +#include "compat.h" +#include "debug.h" +#include "settings.h" + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -37,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif -extern int irq_select_affinity_usr(unsigned int irq); +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -64,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* - * Debugging printout: + * Manipulation functions for irq_data.state */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + irq_compat_set_move_pending(irq_data_to_desc(d)); +} -#include <linux/kallsyms.h> - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + irq_compat_clr_move_pending(irq_data_to_desc(d)); +} -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +static inline void irqd_clear(struct irq_data *d, unsigned int mask) { - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_INPROGRESS); - P(IRQ_DISABLED); - P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_AUTODETECT); - P(IRQ_WAITING); - P(IRQ_LEVEL); - P(IRQ_MASKED); -#ifdef CONFIG_IRQ_PER_CPU - P(IRQ_PER_CPU); -#endif - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); + d->state_use_accessors &= ~mask; } -#undef P +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f202..dbccc79 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -94,7 +95,7 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, NR_IRQS); +static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ @@ -206,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } +static int irq_expand_nr_irqs(unsigned int nr) +{ + if (nr > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs = nr; + return 0; +} + int __init early_irq_init(void) { int i, initcnt, node = first_online_node; @@ -217,6 +226,15 @@ int __init early_irq_init(void) initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) + nr_irqs = IRQ_BITMAP_BITS; + + if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) + initcnt = IRQ_BITMAP_BITS; + + if (initcnt > nr_irqs) + nr_irqs = initcnt; + for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node); set_bit(i, allocated_irqs); @@ -229,7 +247,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, + .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), @@ -251,8 +269,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... */ desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -277,24 +295,14 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } + +static int irq_expand_nr_irqs(unsigned int nr) +{ + return -ENOMEM; +} + #endif /* !CONFIG_SPARSE_IRQ */ /* Dynamic interrupt handling */ @@ -338,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - ret = -ENOMEM; - if (start >= nr_irqs) - goto err; + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); + if (ret) + goto err; + } bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); @@ -392,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f..acd599a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -30,7 +41,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + unsigned int state; if (!desc) return; @@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + state = desc->istate; raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (status & IRQ_INPROGRESS); + } while (state & IRQS_INPROGRESS); /* * We made sure that no hardirq handler is running. Now verify @@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc) } } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +{ + return irq_settings_can_move_pcntxt(desc); +} +static inline bool irq_move_pending(struct irq_desc *desc) +{ + return irqd_is_setaffinity_pending(&desc->irq_data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } +static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } +#endif + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; + int ret = 0; if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); + if (irq_can_move_pcntxt(desc)) { + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); + ret = 0; } + } else { + irqd_set_move_pending(&desc->irq_data); + irq_copy_pending(desc, mask); } - else { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, cpumask); - } -#else - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); + + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); } -#endif - desc->status |= IRQ_AFFINITY_SET; + irq_compat_set_affinity(desc); + irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; + return ret; } int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->affinity_hint = m; + irq_put_desc_unlock(desc, flags); + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (irq_move_pending(desc)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_data.affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(¬ify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; unsigned long flags; + /* The release function is promised process context */ + might_sleep(); + if (!desc) return -EINVAL; + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(¬ify->kref); + INIT_WORK(¬ify->work, irq_affinity_notify); + } + raw_spin_lock_irqsave(&desc->lock, flags); - desc->affinity_hint = m; + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = irq_desc_get_chip(desc); + struct cpumask *set = irq_default_affinity; + int ret; + + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) - < nr_cpu_ids) - goto set_affinity; - else - desc->status &= ~IRQ_AFFINITY_SET; + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; + else { + irq_compat_clr_affinity(desc); + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); -set_affinity: - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); - + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); - if (!ret) - irq_set_thread_affinity(desc); + ret = setup_affinity(irq, desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->depth++) + irq_disable(desc); +} + +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; } /** @@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return; - - disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) - desc->status &= ~IRQ_SUSPENDED; + if (resume) { + if (!(desc->istate & IRQS_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } + desc->istate &= ~IRQS_SUSPENDED; + } switch (desc->depth) { case 0: @@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } @@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + goto out; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * @@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". */ -int set_irq_wake(unsigned int irq, unsigned int on) +int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_wake); +EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a @@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; -} - -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!(desc->istate & IRQS_MASKED)) + mask_irq(desc); + if (!(desc->istate & IRQS_DISABLED)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); + ret = 0; + break; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); } - + if (unmask) + unmask_irq(desc); return ret; } @@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -522,26 +644,44 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. + */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) { + irq_compat_clr_masked(desc); + desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) @@ -573,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif /* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. + */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -582,7 +748,14 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + int wake; + + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -594,23 +767,20 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(desc->istate & IRQS_DISABLED)) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -619,6 +789,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -633,6 +806,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -641,6 +815,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -648,6 +830,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; - int nested, shared = 0; - int ret; + unsigned long flags, thread_mask = 0; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. */ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + irq_setup_forced_threading(new); } /* @@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->irq_data.chip); @@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; - } else - compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + goto out_mask; + } - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_INPROGRESS | IRQS_ONESHOT | \ + IRQS_WAITING); + + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (irq_settings_can_autoenable(desc)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); - - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... */ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + setup_affinity(irq, desc, mask); + + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; @@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } @@ -849,6 +1063,9 @@ mismatch: #endif ret = -EBUSY; +out_mask: + free_cpumask_var(mask); + out_thread: raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { @@ -871,9 +1088,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ @@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); @@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1100,7 +1322,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (retval) kfree(action); -#ifdef CONFIG_DEBUG_SHIRQ +#ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it @@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd62..ec4806d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,23 +4,23 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -53,15 +53,20 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void move_masked_irq(int irq) +{ + irq_move_masked_irq(irq_get_irq_data(irq)); +} + +void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(idata); bool masked; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(desc->istate & IRQS_DISABLED)) return; /* @@ -69,10 +74,15 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = desc->istate & IRQS_MASKED; if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); +} + +void move_native_irq(int irq) +{ + irq_move_irq(irq_get_irq_data(irq)); } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d8..f76fc00 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -71,9 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9..4cc2e5e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/kernel_stat.h> #include "internals.h" @@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif seq_cpumask(m, mask); @@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; @@ -357,3 +357,65 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a..ad683a9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -23,7 +23,7 @@ #ifdef CONFIG_HARDIRQS_SW_RESEND /* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, NR_IRQS); +static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); /* * Run software resends of IRQ's @@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - - /* - * Make sure the interrupt is enabled, before resending it: - */ - desc->irq_data.chip->irq_enable(&desc->irq_data); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. */ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (irq_settings_is_level(desc)) + return; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->istate & IRQS_PENDING) { + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 0000000..0227ad3 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,138 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, +}; + +#define IRQ_INPROGRESS GOT_YOU_MORON +#define IRQ_REPLAY GOT_YOU_MORON +#define IRQ_WAITING GOT_YOU_MORON +#define IRQ_DISABLED GOT_YOU_MORON +#define IRQ_PENDING GOT_YOU_MORON +#define IRQ_MASKED GOT_YOU_MORON +#define IRQ_WAKEUP GOT_YOU_MORON +#define IRQ_MOVE_PENDING GOT_YOU_MORON +#define IRQ_PER_CPU GOT_YOU_MORON +#define IRQ_NO_BALANCING GOT_YOU_MORON +#define IRQ_AFFINITY_SET GOT_YOU_MORON +#define IRQ_LEVEL GOT_YOU_MORON +#define IRQ_NOPROBE GOT_YOU_MORON +#define IRQ_NOREQUEST GOT_YOU_MORON +#define IRQ_NOAUTOEN GOT_YOU_MORON +#define IRQ_NESTED_THREAD GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status &= ~(clr & _IRQF_MODIFY_MASK); + desc->status |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status & _IRQ_NO_BALANCING; +} + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status |= _IRQ_LEVEL; +} + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status & _IRQ_NESTED_THREAD; +} + +/* Nothing should touch desc->status from now on */ +#undef status +#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9..dd586eb 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,70 +21,94 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; + +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. + */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (desc->istate & IRQS_INPROGRESS) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (desc->istate & IRQS_INPROGRESS); + /* Might have been disabled in meantime */ + return !(desc->istate & IRQS_DISABLED) && desc->action; +#else + return false; +#endif +} + /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc) +static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0, work = 0; raw_spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - raw_spin_unlock(&desc->lock); - return ok; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; + /* PER_CPU and nested thread interrupts are never polled */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + goto out; /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. */ - while ((desc->status & IRQ_PENDING) && action) { + if ((desc->istate & IRQS_DISABLED) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || !action->next) + goto out; + + /* Already running on another processor */ + if (desc->istate & IRQS_INPROGRESS) { /* - * Perform real IRQ processing for the IRQ we deferred + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too */ - work = 1; - raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + goto out; } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work) - irq_end(irq, desc); - raw_spin_unlock(&desc->lock); - return ok; + /* Mark it poll in progress */ + desc->istate |= IRQS_POLL_INPROGRESS; + do { + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; + action = desc->action; + } while ((desc->istate & IRQS_PENDING) && action); + desc->istate &= ~IRQS_POLL_INPROGRESS; +out: + raw_spin_unlock(&desc->lock); + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) @@ -92,6 +116,11 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; + if (atomic_inc_return(&irq_poll_active) == 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { if (!i) continue; @@ -99,9 +128,11 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc)) + if (try_one_irq(i, desc, false)) ok = 1; } +out: + atomic_dec(&irq_poll_active); /* So the caller can adjust the irq error counts */ return ok; } @@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy) struct irq_desc *desc; int i; + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); - try_one_irq(i, desc); + try_one_irq(i, desc, true); local_irq_enable(); } - +out: + atomic_dec(&irq_poll_active); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } @@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void @@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { + if (desc->istate & IRQS_POLL_INPROGRESS) + return; + if (unlikely(action_ret != IRQ_HANDLED)) { /* * If we are seeing only the odd spurious IRQ caused by @@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 999835b..ed253aa 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,13 +38,96 @@ #include <asm/irq_regs.h> +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -89,6 +193,360 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) + ctx->nr_cgroups--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -544,7 +1023,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. */ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. */ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +1235,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,10 +1251,48 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, @@ -794,6 +1305,17 @@ event_sched_in(struct perf_event *event, event->state = PERF_EVENT_STATE_ACTIVE; event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. + */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + /* * The new state must be visible before we turn it on in the hardware: */ @@ -807,7 +1329,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -928,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -942,21 +1467,22 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -997,6 +1523,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1008,8 +1536,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1018,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1025,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1078,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1086,26 +1612,27 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, ctx); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1138,6 +1665,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1158,8 +1687,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1178,8 +1706,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1187,15 +1722,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1227,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1339,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1416,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1454,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1486,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1496,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1521,11 +2076,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1533,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1557,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1592,14 +2149,17 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; @@ -1627,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -1797,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1876,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1901,8 +2461,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -1933,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2213,6 +2777,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. + */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2237,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2250,6 +2818,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2271,8 +2840,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -2312,7 +2883,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2321,6 +2892,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -2328,6 +2903,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -4395,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); @@ -5051,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + event->pmu->read(event); perf_sample_data_init(&data, 0); @@ -5077,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - period = local64_read(&hwc->period_left); if (period) { if (period < 0) @@ -5106,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) } } +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + /* * Software event: cpu wall time clock */ @@ -5158,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5213,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } @@ -5235,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5506,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5642,7 +6235,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5817,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5834,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. + */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5851,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5865,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -5950,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5976,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6001,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6051,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6104,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); parent_event = child_event->parent; /* @@ -6411,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6526,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6595,9 +7214,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) @@ -6721,3 +7340,83 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + + jc = kzalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/pid.c b/kernel/pid.c index 39b65b6..02f2212 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_unlock(); return pid; } +EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { @@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f8..0da058b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = { static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); static int pm_qos_power_release(struct inode *inode, struct file *filp); static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, + .read = pm_qos_power_read, .open = pm_qos_power_open, .release = pm_qos_power_release, .llseek = noop_llseek, @@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) } +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_object *o; + struct pm_qos_request_list *pm_qos_req = filp->private_data;; + + if (!pm_qos_req) + return -EINVAL; + if (!pm_qos_request_active(pm_qos_req)) + return -EINVAL; + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(o); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb717..67fea9d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) @@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb..4c01249 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include <linux/init.h> #include <linux/compiler.h> #include <linux/idr.h> +#include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> @@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as @@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); * necessary code is written. The standard says we should say * something about this issue in the documentation... * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some @@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { ktime_get_real_ts(tp); return 0; } -static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) { - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - if (posix_clocks[which_clock].res != 0) - return 0; - return 1; + return do_adjtimex(t); } /* @@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { @@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) kmem_cache_free(posix_timers_cache, tmr); } +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(¤t->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify @@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* @@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -828,8 +838,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -844,7 +857,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ @@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. */ - return -ENOTSUP; -#endif -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + return err; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } @@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); -} - -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.index; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + return kc->nsleep_restart(restart_block); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2657299..4603f08 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,125 +1,12 @@ -config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at <http://www.linux-on-laptops.com/> or - Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> - and the Battery Powered Linux mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. - -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables various debugging support in the Power Management - code. This is helpful when debugging and reporting PM bugs, like - suspend support. - -config PM_ADVANCED_DEBUG - bool "Extra PM attributes in sysfs for low-level debugging/testing" - depends on PM_DEBUG - default n - ---help--- - Add extra sysfs attributes allowing one to access some Power Management - fields of device objects from user space. If you are not a kernel - developer interested in debugging/testing Power Management, say "no". - -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - default n - ---help--- - This option enables verbose messages from the Power Management code. - -config CAN_PM_TRACE - def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL - -config PM_TRACE - bool - help - This enables code to save the last PM event point across - reboot. The architecture needs to support this, x86 for - example does by saving things in the RTC, see below. - - The architecture specific code must provide the extern - functions from <linux/resume-trace.h> as well as the - <asm/resume-trace.h> header with a TRACE_RESUME() macro. - - The way the information is presented is architecture- - dependent, x86 will print the information during a - late_initcall. - -config PM_TRACE_RTC - bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE - depends on X86 - select PM_TRACE - default n - ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the - machine, reboot it and then run - - dmesg -s 1000000 | grep 'hash matches' - - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. - -config PM_SLEEP_SMP - bool - depends on SMP - depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE - depends on PM_SLEEP - select HOTPLUG - select HOTPLUG_CPU - default y - -config PM_SLEEP - bool - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y - -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config SUSPEND bool "Suspend to RAM and standby" - depends on PM && ARCH_SUSPEND_POSSIBLE + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). -config PM_TEST_SUSPEND - bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- - This option will let you suspend your machine during bootup, and - make it wake up a few seconds later using an RTC wakeup alarm. - Enable this with a kernel parameter like "test_suspend=mem". - - You probably want to have your system's RTC driver statically - linked, ensuring that it's available when this test runs. - config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN @@ -133,7 +20,7 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" - depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + depends on SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -196,6 +83,106 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG + select HOTPLUG_CPU + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + def_bool y + depends on PM_SLEEP || PM_RUNTIME + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + ---help--- + This option enables verbose messages from the Power Management code. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config CAN_PM_TRACE + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from <linux/resume-trace.h> as well as the + <asm/resume-trace.h> header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on CAN_PM_TRACE + depends on X86 + select PM_TRACE + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION @@ -222,31 +209,11 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on PM - ---help--- - Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated - wake-up event or a driver's request. - - Hardware support is generally required for this functionality to work - and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and - wake-up events. - -config PM_OPS - bool - depends on PM_SLEEP || PM_RUNTIME - default y - config ARCH_HAS_OPP bool config PM_OPP bool "Operating Performance Point (OPP) Layer library" - depends on PM depends on ARCH_HAS_OPP ---help--- SOCs have a standard set of tuples consisting of frequency and diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1832bd2..aeabd26 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -23,6 +23,7 @@ #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/gfp.h> +#include <linux/syscore_ops.h> #include <scsi/scsi_scan.h> #include <asm/suspend.h> @@ -272,6 +273,8 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); + if (!error) + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -295,6 +298,7 @@ static int create_image(int platform_mode) } Power_up: + syscore_resume(); sysdev_resume(); /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. @@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); + if (!error) + error = syscore_suspend(); if (error) goto Enable_irqs; @@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode) restore_processor_state(); touch_softlockup_watchdog(); + syscore_resume(); sysdev_resume(); Enable_irqs: @@ -516,6 +523,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -526,6 +534,7 @@ int hibernation_platform_enter(void) while (1); Power_up: + syscore_resume(); sysdev_resume(); local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a..8eaba5f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -17,9 +17,6 @@ DEFINE_MUTEX(pm_mutex); -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - #ifdef CONFIG_PM_SLEEP /* Routines for PM-transition notifications */ @@ -326,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } diff --git a/kernel/power/process.c b/kernel/power/process.c index d6d2a10..0cf3a27 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,7 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezeable(struct task_struct * p) +static inline int freezable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || @@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) + if (frozen(p) || !freezable(p)) continue; if (!freeze_task(p, sig_only)) @@ -167,7 +167,7 @@ static void thaw_tasks(bool nosig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) + if (!freezable(p)) continue; if (nosig_only && should_send_signal(p)) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75e..ca0aacc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *); /* * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. + * When it is set to N, the image creating code will do its best to + * ensure the image size will not exceed N bytes, but if that is + * impossible, it will try to create the smallest image possible. */ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = (totalram_pages / 3) * PAGE_SIZE; } /* List of PBEs needed for restoring the pages that were allocated before @@ -1519,11 +1519,8 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error = 0; - if (nr_highmem > 0) { - error = get_highmem_buffer(PG_ANY); - if (error) + if (get_highmem_buffer(PG_ANY)) goto err_out; if (nr_highmem > alloc_highmem) { nr_highmem -= alloc_highmem; @@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, err_out: swsusp_free(); - return error; + return -ENOMEM; } asmlinkage int swsusp_save(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index de6f86b..2814c32 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/suspend.h> +#include <linux/syscore_ops.h> #include <trace/events/power.h> #include "power.h" @@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); + if (!error) + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } + syscore_resume(); sysdev_resume(); } diff --git a/kernel/printk.c b/kernel/printk.c index 3623152..33284ad 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -515,6 +515,71 @@ static void _call_console_drivers(unsigned start, } /* + * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the + * lower 3 bit are the log level, the rest are the log facility. In case + * userspace passes usual userspace syslog messages to /dev/kmsg or + * /dev/ttyprintk, the log prefix might contain the facility. Printk needs + * to extract the correct log level for in-kernel processing, and not mangle + * the original value. + * + * If a prefix is found, the length of the prefix is returned. If 'level' is + * passed, it will be filled in with the log level without a possible facility + * value. If 'special' is passed, the special printk prefix chars are accepted + * and returned. If no valid header is found, 0 is returned and the passed + * variables are not touched. + */ +static size_t log_prefix(const char *p, unsigned int *level, char *special) +{ + unsigned int lev = 0; + char sp = '\0'; + size_t len; + + if (p[0] != '<' || !p[1]) + return 0; + if (p[2] == '>') { + /* usual single digit level number or special char */ + switch (p[1]) { + case '0' ... '7': + lev = p[1] - '0'; + break; + case 'c': /* KERN_CONT */ + case 'd': /* KERN_DEFAULT */ + sp = p[1]; + break; + default: + return 0; + } + len = 3; + } else { + /* multi digit including the level and facility number */ + char *endp = NULL; + + if (p[1] < '0' && p[1] > '9') + return 0; + + lev = (simple_strtoul(&p[1], &endp, 10) & 7); + if (endp == NULL || endp[0] != '>') + return 0; + len = (endp + 1) - p; + } + + /* do not accept special char if not asked for */ + if (sp && !special) + return 0; + + if (special) { + *special = sp; + /* return special char, do not touch level */ + if (sp) + return len; + } + + if (level) + *level = lev; + return len; +} + +/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. @@ -529,13 +594,9 @@ static void call_console_drivers(unsigned start, unsigned end) cur_index = start; start_print = start; while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') { - msg_level = LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; + if (msg_level < 0 && ((end - cur_index) > 2)) { + /* strip log prefix */ + cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); start_print = cur_index; } while (cur_index != end) { @@ -733,6 +794,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) unsigned long flags; int this_cpu; char *p; + size_t plen; + char special; boot_delay_msec(); printk_delay(); @@ -773,45 +836,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; - /* Do we have a loglevel in the string? */ - if (p[0] == '<') { - unsigned char c = p[1]; - if (c && p[2] == '>') { - switch (c) { - case '0' ... '7': /* loglevel */ - current_log_level = c - '0'; - /* Fallthrough - make sure we're on a new line */ - case 'd': /* KERN_DEFAULT */ - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } - /* Fallthrough - skip the loglevel */ - case 'c': /* KERN_CONT */ - p += 3; - break; + /* Read log level and handle special printk prefix */ + plen = log_prefix(p, ¤t_log_level, &special); + if (plen) { + p += plen; + + switch (special) { + case 'c': /* Strip <c> KERN_CONT, continue line */ + plen = 0; + break; + case 'd': /* Strip <d> KERN_DEFAULT, start new line */ + plen = 0; + default: + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; } } } /* - * Copy the output into log_buf. If the caller didn't provide - * appropriate log level tags, we insert them here + * Copy the output into log_buf. If the caller didn't provide + * the appropriate log prefix, we insert them here */ - for ( ; *p; p++) { + for (; *p; p++) { if (new_text_line) { - /* Always output the token */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; new_text_line = 0; + if (plen) { + /* Copy original log prefix */ + int i; + + for (i = 0; i < plen; i++) + emit_log_char(printk_buf[i]); + printed_len += plen; + } else { + /* Add log prefix */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + } + if (printk_time) { - /* Follow the token with the time */ + /* Add the current time stamp */ char tbuf[50], *tp; unsigned tlen; unsigned long long t; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1708b1e..e2302e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task) { int retval; @@ -219,7 +219,7 @@ out: * Performs checks and sets PT_PTRACED. * Should be used by all ptrace implementations for PTRACE_TRACEME. */ -int ptrace_traceme(void) +static int ptrace_traceme(void) { int ret = -EPERM; @@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) return false; } -int ptrace_detach(struct task_struct *child, unsigned int data) +static int ptrace_detach(struct task_struct *child, unsigned int data) { bool dead = false; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a..f3240e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * Note that the machinery to reliably determine whether + * or not we are in an RCU read-side critical section + * exists only in the preemptible RCU implementations + * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why + * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. */ -#ifndef CONFIG_PREEMPT - WARN_ON(1); - return 0; -#else if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) rcu_barrier_bh(); debug_object_free(head, &rcuhead_debug_descr); return 1; -#endif default: return 0; } diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abae..3cb8e36 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -852,7 +852,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f9..c224da4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,6 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <asm/byteorder.h> -#include <linux/sched.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54..3c7cbc2 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89b..5c9ccd3 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -9,7 +9,6 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/sysdev.h> #include <linux/timer.h> @@ -27,7 +26,6 @@ struct test_thread_data { int opcode; int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; int event; struct sys_device sysdev; }; @@ -46,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; @@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[i] = 0; } } - - if (!lockwakeup && td->bkl == 4) { -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - } return 0; case RTTEST_RESETEVENT: @@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[id] = 0; return 0; - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; -#ifdef CONFIG_LOCK_KERNEL - lock_kernel(); -#endif - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - return 0; - default: break; } @@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); break; - case RTTEST_LOCKBKL: default: break; } @@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); return; - case RTTEST_LOCKBKL: - return; default: return; } @@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_lock(&rttest_lock); curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", td->opcode, td->event, tsk->state, (MAX_RT_PRIO - 1) - tsk->prio, (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); + tsk->pi_blocked_on); for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) curr += sprintf(curr, "%d", td->mutexes[i]); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a960481..ab44911 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - -/* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. - */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. We might have * returned with -EDEADLK and the owner @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /* @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - /* - * Readjust priority, when we did not get the lock. We might have been - * the pending owner and boosted. Since we did not take the lock, the - * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - return ret; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81..53a66c8 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) /* * lock->owner state tracking: */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } -static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{ - return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} - /* * PI-futex support (proxy locking functions, etc.): */ diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4..a172494 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <linux/smp_lock.h> #include <asm/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> @@ -324,7 +323,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; unsigned int nr_spread_over; @@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq) #endif /** - * runqueue_is_locked + * runqueue_is_locked - Returns true if the current cpu runqueue is locked * @cpu: the processor in question. * - * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ @@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); @@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } @@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. @@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2776,9 +2819,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } /* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); - - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + target_cputime64 = &cpustat->system; - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait + * @cputime: the cpu time spent in involuntary wait */ void account_steal_time(cputime_t cputime) { @@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime) #ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } @@ -3945,9 +4085,6 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; - release_kernel_lock(prev); -need_resched_nonpreemptible: - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3989,9 +4126,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -4010,9 +4144,6 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); if (need_resched()) goto need_resched; @@ -4213,6 +4344,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { __wake_up_common(q, mode, 1, 0, key); } +EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. @@ -4570,11 +4702,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, &flags); } @@ -4822,12 +4953,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) @@ -4902,11 +5036,10 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5323,6 +5456,65 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -5571,7 +5763,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* @@ -7796,6 +7988,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; + /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP + cfs_rq->load_stamp = 1; +#endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } @@ -8074,7 +8270,7 @@ static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); + return (nested == preempt_offset); } void __might_sleep(const char *file, int line, int preempt_offset) @@ -8109,6 +8305,8 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; on_rq = p->se.on_rq; @@ -8119,6 +8317,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) @@ -8510,7 +8710,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8884,7 +9084,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb6562..5946ac5 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ff..0557705 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8e..7bacd83 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2d..3f7ec9e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8; unsigned int sysctl_sched_child_runs_first __read_mostly; /* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - -/* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. Skip current tasks that @@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) now - cfs_rq->load_last > 4 * period) { cfs_rq->load_period = 0; cfs_rq->load_avg = 0; + delta = period - 1; } cfs_rq->load_stamp = now; @@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} + +static void __clear_buddies_next(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(cfs_rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. @@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); - - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. */ if (!(*balance)) goto ret; @@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3330,8 +3372,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3386,10 +3427,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3403,11 +3440,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } @@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } @@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, @@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f40..c82f26c1 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { - /* Can this actually happen?? */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); + BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); + BUG(); } static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad62677..db308cb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se; - rt_se = rt_rq->tg->rt_se[this_cpu]; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) @@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct sched_rt_entity *rt_se; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - rt_se = rt_rq->tg->rt_se[this_cpu]; + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_se && on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); @@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) + } else if (rt_rq->rt_nr_running) { idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } if (enqueue) sched_rt_rq_enqueue(rt_rq); @@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->se.on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. */ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->se.on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->se.on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47..84ec9bc 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? */ } diff --git a/kernel/smp.c b/kernel/smp.c index 9910744..7cbd0f2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - void (*func) (void *info); + smp_call_func_t func; /* * Since we walk the list without any locks, we might @@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - func = data->csd.func; /* for later warn */ - data->csd.func(data->csd.info); + func = data->csd.func; /* save for later warn */ + func(data->csd.info); /* - * If the cpu mask is not still set then it enabled interrupts, - * we took another smp interrupt, and executed the function - * twice on this cpu. In theory that copy decremented refs. + * If the cpu mask is not still set then func enabled + * interrupts (BUG), and this cpu took another smp call + * function interrupt and executed func(info) twice + * on this cpu. That nested execution decremented refs. */ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pS enabled interrupts and double executed\n", - func); + WARN(1, "%pf enabled interrupts and double executed\n", func); continue; } @@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, this_cpu = smp_processor_id(); + int refs, cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); - /* So, what's a CPU they want? Ignoring this one. */ + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); @@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + + /* This BUG_ON verifies our reuse assertions and can be removed */ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); + /* + * The global call function queue list add and delete are protected + * by a lock, but the list is traversed without any lock, relying + * on the rcu list add and delete to allow safe concurrent traversal. + * We reuse the call function data without waiting for any grace + * period after some other cpu removes it from the global queue. + * This means a cpu might find our data block as it is being + * filled out. + * + * We hold off the interrupt handler on the other cpu by + * ordering our writes to the cpu mask vs our setting of the + * refs counter. We assert only the cpu owning the data block + * will set a bit in cpumask, and each bit will only be cleared + * by the subject cpu. Each cpu must first find its bit is + * set and then check that refs is set indicating the element is + * ready to be processed, otherwise it must skip the entry. + * + * On the previous iteration refs was set to 0 by another cpu. + * To avoid the use of transitivity, set the counter to 0 here + * so the wmb will pair with the rmb in the interrupt handler. + */ + atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ + data->csd.func = func; data->csd.info = info; - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); - /* - * To ensure the interrupt handler gets an complete view - * we order the cpumask and refs writes and order the read - * of them in the interrupt handler. In addition we may - * only clear our own cpu bit from the mask. - */ + /* Ensure 0 refs is visible before mask. Also orders func and info */ smp_wmb(); - atomic_set(&data->refs, cpumask_weight(data->cpumask)); + /* We rely on the "and" being processed before the store */ + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); + refs = cpumask_weight(data->cpumask); + + /* Some callers race with other cpus changing the passed mask */ + if (unlikely(!refs)) { + csd_unlock(&data->csd); + return; + } raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask, * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); + /* + * We rely on the wmb() in list_add_rcu to complete our writes + * to the cpumask before this write to refs, which indicates + * data is on the list and is ready to be processed. + */ + atomic_set(&data->refs, refs); raw_spin_unlock_irqrestore(&call_function.lock, flags); /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5ef..56e5dec 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* @@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); diff --git a/kernel/sys.c b/kernel/sys.c index 18da702..1ad48b3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,7 @@ #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/gfp.h> +#include <linux/syscore_ops.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -298,6 +299,7 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; device_shutdown(); sysdev_shutdown(); + syscore_shutdown(); } /** @@ -336,6 +338,7 @@ void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); @@ -355,6 +358,7 @@ void kernel_power_off(void) pm_power_off_prepare(); disable_nonboot_cpus(); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9..25cc41c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83..40245d69 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -194,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { - .count = 1, + {{.count = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -361,20 +361,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, @@ -948,7 +941,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK @@ -1567,11 +1560,16 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static void free_head(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ctl_table_header, rcu)); +} + void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - kfree(head); + call_rcu(&head->rcu, free_head); spin_unlock(&sysctl_lock); } @@ -1685,13 +1683,8 @@ static int test_perm(int mode, int op) int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { - int error; int mode; - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; - if (root->permissions) mode = root->permissions(root, current->nsproxy, table); else @@ -1948,10 +1941,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - kfree(header->parent); + call_rcu(&header->parent->rcu, free_head); } if (!--header->count) - kfree(header); + call_rcu(&header->rcu, free_head); spin_unlock(&sysctl_lock); } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bed..3b8e028 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? */ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } diff --git a/kernel/time.c b/kernel/time.c index 3217435..8e8dc6d 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) { - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; + return (unsigned long)nsecs_to_jiffies64(n); } -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); /* * Add two timespec values and do a safety check for overflow. diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee26662..b042599 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fd..0d74b9b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysdev.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a84..b2fa506 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,8 +22,11 @@ ************************************************************************/ #include <linux/clocksource.h> #include <linux/jiffies.h> +#include <linux/module.h> #include <linux/init.h> +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. It has the same coarse resolution as @@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242..5f1bb8e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include <linux/mm.h> #include <linux/module.h> +#include "tick-internal.h" + /* * NTP timekeeping variables: */ @@ -646,6 +648,17 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + result = timekeeping_inject_offset(&delta); + if (result) + return result; + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 0000000..25028dd --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,451 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/file.h> +#include <linux/mutex.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + mutex_lock(&clk->mutex); + + if (!clk->zombie) + return clk; + + mutex_unlock(&clk->mutex); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + mutex_unlock(&clk->mutex); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + mutex_lock(&clk->mutex); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + mutex_unlock(&clk->mutex); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + mutex_init(&clk->mutex); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + if (err) + goto no_cdev; + + return err; +no_cdev: + mutex_destroy(&clk->mutex); + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + mutex_destroy(&clk->mutex); + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + mutex_lock(&clk->mutex); + clk->zombie = true; + mutex_unlock(&clk->mutex); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761..da800ff 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" @@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void) return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80..119528d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <asm/irq_regs.h> @@ -51,7 +50,11 @@ int tick_is_oneshot_available(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefb..1009b06 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast(int cpu); +bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ /* @@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +#endif + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101..2d04411 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea24..d5097c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <linux/module.h> #include <asm/irq_regs.h> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c756..3bd7e3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; @@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -779,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -871,7 +907,7 @@ void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -889,6 +925,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. * @ts: pointer to the timespec to be converted @@ -910,11 +995,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; @@ -946,3 +1026,48 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} + +/** + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + *sleep = total_sleep_time; + } while (read_seqretry(&xtime_lock, seq)); +} + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 32a19f9..3258455 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%p>", sym); + SEQ_printf(m, "<%pK>", sym); else SEQ_printf(m, "%s", symname); } @@ -112,7 +112,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %p\n", base); + SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", diff --git a/kernel/timer.c b/kernel/timer.c index d53ce66..fd61986 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, @@ -959,11 +965,30 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * hardirq contexts. The caller must not hold locks which would prevent + * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) @@ -971,12 +996,14 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; - raw_local_irq_save(flags); - local_bh_disable(); + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - _local_bh_enable(); - raw_local_irq_restore(flags); + local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it @@ -1297,19 +1324,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - #ifdef __ARCH_WANT_SYS_ALARM /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d95721f..cbafed7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1827,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } -void blk_fill_rwbs_rq(char *rwbs, struct request *rq) -{ - int rw = rq->cmd_flags & 0x03; - int bytes; - - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - - bytes = blk_rq_bytes(rq); - - blk_fill_rwbs(rwbs, rw, bytes); -} - #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae..888b611 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ee56b4..d9c8bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> -#include <linux/ftrace_irq.h> #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/uaccess.h> @@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb..9541c27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -780,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -792,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif @@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) @@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c..5e9dfc6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; @@ -606,6 +606,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* @@ -661,8 +662,10 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; - struct filter_pred **preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -674,11 +677,23 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 + struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); @@ -700,11 +715,23 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf2237..1516cb3 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e04..e88f74f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } @@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d4010..3249b4f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,9 +123,13 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; + struct filter_pred *preds; struct filter_pred *pred; - int i; + struct filter_pred *root; + int n_preds; + int done = 0; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + pred = root; - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); } -static void filter_disable_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; - call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_call *call) { - int i; + call->flags &= ~TRACE_EVENT_FL_FILTERED; +} +static void __free_filter(struct event_filter *filter) +{ if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ struct filter_pred *pred; int i; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); + if (filter->preds) + __free_preds(filter); - filter->n_preds = 0; + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!filter->preds) - goto oom; + return -ENOMEM; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } - return filter; - -oom: - __free_preds(filter); - return ERR_PTR(-ENOMEM); -} - -static int init_preds(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - return 0; } -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; - fn = filter_pred_and; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; - fn = filter_pred_or; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_pred_stack(&stack, n_preds); + if (err) + return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) + goto fail; + } + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter; + + if (strcmp(call->class->system, system->name) != 0) continue; - /* really apply the filter */ - filter_disable_preds(call); - err = replace_preds(call, filter, ps, filter_string, false); + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; + + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); if (err) - filter_disable_preds(call); - else { + goto fail_mem; + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); } return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_preds(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - remove_filter_string(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_filter(filter); goto out_unlock; } @@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1334,18 +1880,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); - if (IS_ERR(filter)) { + filter = __alloc_filter(); + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } @@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1435,9 +1994,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bc..8435b43 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +404,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +649,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { @@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0 && t != NULL) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); @@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272ba..456be90 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) @@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d0..7e62c0a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08..ee7b5a0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. - */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; @@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; @@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; @@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f37f974..18bb157 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,8 +363,14 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + else + printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11869fa..5ca7ce9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -79,7 +79,9 @@ enum { MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ - MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, + /* call for help after 10ms + (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ @@ -249,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -314,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -385,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, @@ -2047,6 +2057,15 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + + /* + * Leave this gcwq. If keep_working() is %true, notify a + * regular worker; otherwise, we end up with 0 concurrency + * and stalling the execution. + */ + if (keep_working(gcwq)) + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); } @@ -2956,7 +2975,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, */ spin_lock(&workqueue_lock); - if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + if (workqueue_freezing && wq->flags & WQ_FREEZABLE) for_each_cwq_cpu(cpu, wq) get_cwq(cpu, wq)->max_active = 0; @@ -3068,7 +3087,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) spin_lock_irq(&gcwq->lock); - if (!(wq->flags & WQ_FREEZEABLE) || + if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) get_cwq(gcwq->cpu, wq)->max_active = max_active; @@ -3318,7 +3337,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * want to get it over with ASAP - spam rescuers, wake up as * many idlers as necessary and create new ones till the * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezeable cwqs. Don't declare + * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ while (gcwq->nr_workers != gcwq->nr_idle || @@ -3576,9 +3595,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); /** * freeze_workqueues_begin - begin freezing workqueues * - * Start freezing workqueues. After this function returns, all - * freezeable workqueues will queue new works to their frozen_works - * list instead of gcwq->worklist. + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their frozen_works list instead of + * gcwq->worklist. * * CONTEXT: * Grabs and releases workqueue_lock and gcwq->lock's. @@ -3604,7 +3623,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (cwq && wq->flags & WQ_FREEZEABLE) + if (cwq && wq->flags & WQ_FREEZABLE) cwq->max_active = 0; } @@ -3615,7 +3634,7 @@ void freeze_workqueues_begin(void) } /** - * freeze_workqueues_busy - are freezeable workqueues still busy? + * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). @@ -3624,8 +3643,8 @@ void freeze_workqueues_begin(void) * Grabs and releases workqueue_lock. * * RETURNS: - * %true if some freezeable workqueues are still busy. %false if - * freezing is complete. + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. */ bool freeze_workqueues_busy(void) { @@ -3645,7 +3664,7 @@ bool freeze_workqueues_busy(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; BUG_ON(cwq->nr_active < 0); @@ -3690,7 +3709,7 @@ void thaw_workqueues(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; /* restore max_active and repopulate worklist */ @@ -3764,8 +3783,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq); + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); |