author | James Morris <james.l.morris@oracle.com> | 2012-08-17 20:42:30 +1000
---|---|---
committer | James Morris <james.l.morris@oracle.com> | 2012-08-17 20:42:30 +1000
commit | 51b743fe87d7fb3dba7a2ff4a1fe23bb65dc2245 (patch) |
tree | f8b8f601713a3ecb264eb9f145636343d9350520 /kernel |
parent | 9f99798ff49e73dded73a8c674044ea6fb6af651 (diff) |
parent | d9875690d9b89a866022ff49e3fcea892345ad92 (diff) |
Merge tag 'v3.6-rc2' into next
Linux 3.6-rc2
Resync with Linus.
Diffstat (limited to 'kernel')
79 files changed, 4458 insertions, 3071 deletions
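The largest single change in the kernel/ portion of this resync is the conversion of kernel/async.c from bare running lists to the struct async_domain API (ASYNC_DOMAIN(), async_schedule_domain(), async_synchronize_full_domain(), async_unregister_domain()), shown in the first hunks below. As a minimal sketch only, assuming the 3.6-era <linux/async.h> interface that these hunks introduce (the module name foo and its functions are purely illustrative, not part of the patch):

```c
/*
 * Illustrative sketch: driver-private async domain usage against the
 * 3.6-era <linux/async.h> interface added by the hunks below.
 * foo_domain and foo_probe_async are hypothetical names.
 */
#include <linux/async.h>
#include <linux/init.h>
#include <linux/module.h>

/* A registered domain: also flushed by async_synchronize_full(). */
static ASYNC_DOMAIN(foo_domain);

static void foo_probe_async(void *data, async_cookie_t cookie)
{
	/* slow, independent initialisation work runs here */
}

static int __init foo_init(void)
{
	async_schedule_domain(foo_probe_async, NULL, &foo_domain);

	/* Wait only for work queued in this domain, not globally. */
	async_synchronize_full_domain(&foo_domain);
	return 0;
}

static void __exit foo_exit(void)
{
	async_synchronize_full_domain(&foo_domain);
	/* Keep async_synchronize_full() from waiting on the idle domain. */
	async_unregister_domain(&foo_domain);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
```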
diff --git a/kernel/async.c b/kernel/async.c index bd0c168..9d31183 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); +static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); +static DEFINE_MUTEX(async_register_mutex); struct async_entry { struct list_head list; @@ -71,7 +73,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,13 +84,12 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); + if (running->registered && --running->count == 0) + list_del_init(&running->node); /* 4) free the entry */ kfree(entry); @@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); + if (running->registered && running->count++ == 0) + list_add_tail(&running->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. 
*/ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { + mutex_lock(&async_register_mutex); do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + struct async_domain *domain = NULL; + + spin_lock_irq(&async_lock); + if (!list_empty(&async_domains)) + domain = list_first_entry(&async_domains, typeof(*domain), node); + spin_unlock_irq(&async_lock); + + async_synchronize_cookie_domain(next_cookie, domain); + } while (!list_empty(&async_domains)); + mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** + * async_unregister_domain - ensure no more anonymous waiters on this domain + * @domain: idle domain to flush out of any async_synchronize_full instances + * + * async_synchronize_{cookie|full}_domain() are not flushed since callers + * of these routines should know the lifetime of @domain + * + * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing + */ +void async_unregister_domain(struct async_domain *domain) +{ + mutex_lock(&async_register_mutex); + spin_lock_irq(&async_lock); + WARN_ON(!domain->registered || !list_empty(&domain->node) || + !list_empty(&domain->domain)); + domain->registered = 0; + spin_unlock_irq(&async_lock); + mutex_unlock(&async_register_mutex); +} +EXPORT_SYMBOL_GPL(async_unregister_domain); + +/** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. 
*/ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; + if (!running) + return; + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); diff --git a/kernel/audit.c b/kernel/audit.c index 1c7f2c6..ea3b7b6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) static void audit_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); + char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) @@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, t, size, flags); + if (!nlh) + goto out_kfree_skb; + data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); +out_kfree_skb: + kfree_skb(skb); return NULL; } @@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); + data = nlmsg_data(nlh); switch (msg_type) { case AUDIT_GET: @@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? 
"enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else @@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) - goto nlmsg_failure; + goto err; - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); + if (!nlh) + goto out_kfree_skb; return ab; -nlmsg_failure: /* Used by NLMSG_NEW */ +out_kfree_skb: kfree_skb(ab->skb); ab->skb = NULL; err: @@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key) } /** + * audit_log_link_denied - report a link restriction denial + * @operation: specific link opreation + * @link: the path that triggered the restriction + */ +void audit_log_link_denied(const char *operation, struct path *link) +{ + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_ANOM_LINK); + audit_log_format(ab, "op=%s action=denied", operation); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_d_path(ab, " path=", link); + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + audit_log_end(ab); +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5bf0790..3a5ca58 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -595,7 +595,7 @@ void audit_trim_trees(void) root_mnt = collect_mounts(&path); path_put(&path); - if (!root_mnt) + if (IS_ERR(root_mnt)) goto skip_it; spin_lock(&hash_lock); @@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule) goto Err; mnt = collect_mounts(&path); path_put(&path); - if (!mnt) { - err = -ENOMEM; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto Err; } @@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new) return err; tagged = collect_mounts(&path2); path_put(&path2); - if (!tagged) - return -ENOMEM; + if (IS_ERR(tagged)) + return PTR_ERR(tagged); err = kern_path(old, 0, &path1); if (err) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e683869..3823281 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. 
*/ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata nd; - struct dentry *d; - int err; - - err = kern_path_parent(watch->path, &nd); - if (err) - return err; - - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - return -EINVAL; - } - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + struct dentry *d = kern_path_locked(watch->path, parent); + if (IS_ERR(d)) return PTR_ERR(d); - } + mutex_unlock(&parent->dentry->d_inode->i_mutex); if (d->d_inode) { /* update watch filter fields */ watch->dev = d->d_inode->i_sb->s_dev; watch->ino = d->d_inode->i_ino; } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - - *parent = nd.path; dput(d); return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 72fcd30..7981850 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +static int css_unbias_refcnt(int refcnt) +{ + return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; +} + /* the current nr of refs, always >= 0 whether @css is deactivated or not */ static int css_refcnt(struct cgroup_subsys_state *css) { int v = atomic_read(&css->refcnt); - return v >= 0 ? v : v - CSS_DEACT_BIAS; + return css_unbias_refcnt(v); } /* convenient tests for these bits */ @@ -817,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * We want to drop the active superblock reference from the - * cgroup creation after all the dentry refs are gone - - * kill_sb gets mighty unhappy otherwise. Mark - * dentry->d_fsdata with cgroup_diput() to tell - * cgroup_d_release() to call deactivate_super(). + * Drop the active superblock reference that we took when we + * created the cgroup */ - dentry->d_fsdata = cgroup_diput; + deactivate_super(cgrp->root->sb); /* * if we're getting rid of the cgroup, refcount should ensure @@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) return 1; } -static void cgroup_d_release(struct dentry *dentry) -{ - /* did cgroup_diput() tell me to deactivate super? 
*/ - if (dentry->d_fsdata == cgroup_diput) - deactivate_super(dentry->d_sb); -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -959,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) dget(d); d_delete(d); - simple_unlink(d->d_inode, d); + simple_unlink(cgrp->dentry->d_inode, d); list_del_init(&cfe->node); dput(d); @@ -1073,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); - mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_bits) { @@ -1542,7 +1533,6 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, - .d_release = cgroup_d_release, }; struct inode *inode = @@ -1593,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); @@ -2576,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -3889,8 +3879,12 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); + struct dentry *dentry = css->cgroup->dentry; + struct super_block *sb = dentry->d_sb; - dput(css->cgroup->dentry); + atomic_inc(&sb->s_active); + dput(dentry); + deactivate_super(sb); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -3917,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. 
- */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4008,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_clone(cgrp); } - cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4037,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -4247,10 +4206,8 @@ again: list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); list_del_init(&cgrp->allcg_node); @@ -4324,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* this function shouldn't be used with modular subsystems, since they @@ -4452,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* success! */ @@ -4982,10 +4935,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int v; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); + + switch (v) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb522..14d3258 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd65..f33c715 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). 
+ */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + +/* + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. 
*/ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) @@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. 
*/ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8b68ce7..be7b33b 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -12,6 +12,7 @@ #include <linux/kdb.h> #include <linux/kdebug.h> #include <linux/export.h> +#include <linux/hardirq.h> #include "kdb_private.h" #include "../debug_core.h" @@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; + if (in_nmi()) + reason = KDB_REASON_NMI; + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { if ((bp->bp_enabled) && (bp->bp_addr == addr)) { reason = KDB_REASON_BREAK; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bb9520f..0a69d2a 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -715,9 +715,6 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline == linecount) { char buf1[16] = ""; -#if defined(CONFIG_SMP) - char buf2[32]; -#endif /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -732,14 +729,6 @@ kdb_printit: if (moreprompt == NULL) moreprompt = "more> "; -#if defined(CONFIG_SMP) - if (strchr(moreprompt, '%')) { - sprintf(buf2, moreprompt, get_cpu()); - put_cpu(); - moreprompt = buf2; - } -#endif - kdb_input_flush(); c = console_drivers; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847d..31df170 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -14,6 +14,7 @@ #include <linux/ctype.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/kmsg_dump.h> #include <linux/reboot.h> #include <linux/sched.h> #include <linux/sysrq.h> @@ -138,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1235,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), @@ -2040,8 +2028,15 @@ static int kdb_env(int argc, const char **argv) */ static int kdb_dmesg(int argc, const char **argv) { - char *syslog_data[4], *start, *end, c = '\0', *p; - int diag, logging, logsize, lines = 0, adjust = 0, n; + int diag; + int logging; + int lines = 0; + int adjust = 0; + int n = 0; + int skip = 0; + struct kmsg_dumper dumper = { .active = 1 }; + size_t len; + char buf[201]; if (argc > 2) return KDB_ARGCOUNT; @@ -2064,22 +2059,10 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] - * logical start, end+1. 
*/ - kdb_syslog_data(syslog_data); - if (syslog_data[2] == syslog_data[3]) - return 0; - logsize = syslog_data[1] - syslog_data[0]; - start = syslog_data[2]; - end = syslog_data[3]; -#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) - for (n = 0, p = start; p < end; ++p) { - c = *KDB_WRAP(p); - if (c == '\n') - ++n; - } - if (c != '\n') - ++n; + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) + n++; + if (lines < 0) { if (adjust >= n) kdb_printf("buffer only contains %d lines, nothing " @@ -2087,21 +2070,11 @@ static int kdb_dmesg(int argc, const char **argv) else if (adjust - lines >= n) kdb_printf("buffer only contains %d lines, last %d " "lines printed\n", n, n - adjust); - if (adjust) { - for (; start < end && adjust; ++start) { - if (*KDB_WRAP(start) == '\n') - --adjust; - } - if (start < end) - ++start; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - ++lines; - } - end = p; + skip = adjust; + lines = abs(lines); } else if (lines > 0) { - int skip = n - (adjust + lines); + skip = n - lines - adjust; + lines = abs(lines); if (adjust >= n) { kdb_printf("buffer only contains %d lines, " "nothing printed\n", n); @@ -2112,35 +2085,24 @@ static int kdb_dmesg(int argc, const char **argv) kdb_printf("buffer only contains %d lines, first " "%d lines printed\n", n, lines); } - for (; start < end && skip; ++start) { - if (*KDB_WRAP(start) == '\n') - --skip; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - --lines; - } - end = p; + } else { + lines = n; } - /* Do a line at a time (max 200 chars) to reduce protocol overhead */ - c = '\n'; - while (start != end) { - char buf[201]; - p = buf; - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - while (start < end && (c = *KDB_WRAP(start)) && - (p - buf) < sizeof(buf)-1) { - ++start; - *p++ = c; - if (c == '\n') - break; + + if (skip >= n || skip < 0) + return 0; + + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; } - *p = '\0'; - kdb_printf("%s", buf); + if (!lines--) + break; + + kdb_printf("%.*s\n", (int)len - 1, buf); } - if (c != '\n') - kdb_printf("\n"); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56..392ec6a 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; -extern void kdb_syslog_data(char *syslog_data[]); extern unsigned long kdb_task_state_string(const char *); extern char kdb_task_state_char (const struct task_struct *); extern unsigned long kdb_task_state(const struct task_struct *p, diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a04..98d4597 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. 
+ */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f85c015..b7935fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor @@ -1641,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -4033,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5203,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5222,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. 
+ */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -6248,6 +6280,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6300,7 +6334,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6373,20 +6407,23 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + synchronize_rcu(); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6415,6 +6452,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: @@ -6475,6 +6513,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f..a096c19 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git 
a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d..c08a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -32,19 +32,36 @@ #include <linux/swap.h> /* try_to_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ +#include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/uprobes.h> #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consequtively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -61,17 +78,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended. - */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -100,20 +106,21 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; } -static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { - loff_t vaddr; - - vaddr = vma->vm_start + offset; - vaddr -= vma->vm_pgoff << PAGE_SHIFT; + return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); +} - return vaddr; +static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) +{ + return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** @@ -121,41 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) * based on replace_page in mm/ksm.c * * @vma: vma that holds the pte pointing to page + * @addr: address the old @page is mapped at * @page: the cowed page we are replacing by kpage * @kpage: the modified page we replace page by * * Returns 0 on success, -EFAULT on failure. 
*/ -static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) +static int __replace_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; spinlock_t *ptl; - unsigned long addr; - int err = -EFAULT; - - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + pte_t *ptep; + int err; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + /* For try_to_free_swap() and munlock_vma_page() below */ + lock_page(page); - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + err = -EAGAIN; + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto unlock; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -172,11 +165,15 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page_remove_rmap(page); if (!page_mapped(page)) try_to_free_swap(page); - put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: + if (vma->vm_flags & VM_LOCKED) + munlock_vma_page(page); + put_page(page); + + err = 0; + unlock: + unlock_page(page); return err; } @@ -218,79 +215,46 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - struct address_space *mapping; void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; - struct uprobe *uprobe; - loff_t addr; int ret; +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) return ret; - ret = -EINVAL; - - /* - * We are interested in text pages only. Our pages of interest - * should be mapped for read and execute only. We desist from - * adding probes in write mapped pages since the breakpoints - * might end up in the file copy. 
- */ - if (!valid_vma(vma, is_swbp_insn(&opcode))) - goto put_out; - - uprobe = container_of(auprobe, struct uprobe, arch); - mapping = uprobe->inode->i_mapping; - if (mapping != vma->vm_file->f_mapping) - goto put_out; - - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) - goto put_out; + goto put_old; __SetPageUptodate(new_page); - /* - * lock page will serialize against do_wp_page()'s - * PageAnon() handling - */ - lock_page(old_page); /* copy the page now that we've got it stable */ vaddr_old = kmap_atomic(old_page); vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); ret = anon_vma_prepare(vma); if (ret) - goto unlock_out; + goto put_new; - lock_page(new_page); - ret = __replace_page(vma, old_page, new_page); - unlock_page(new_page); + ret = __replace_page(vma, vaddr, old_page, new_page); -unlock_out: - unlock_page(old_page); +put_new: page_cache_release(new_page); - -put_out: +put_old: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } @@ -312,7 +276,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; @@ -333,10 +297,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; @@ -355,7 +329,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). 
+ */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -520,7 +496,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -588,20 +563,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, - unsigned long nbytes, unsigned long offset) +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, + unsigned long nbytes, loff_t offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + if (!mapping->a_ops->readpage) + return -EIO; + + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -612,22 +589,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -638,13 +613,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -672,9 +647,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -687,20 +661,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } @@ -713,7 
+689,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. */ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -721,27 +697,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe(). */ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -750,139 +720,136 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + unsigned long vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) +{ + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; } - - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. 
- */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); - - return vi; + if (!prev) { + more++; + continue; } - } - - return NULL; -} -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. - */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); + info = prev; + prev = prev->next; + info->next = curr; + curr = info; - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + info->mm = vma->vm_mm; + info->vaddr = offset_to_vaddr(vma, offset); + } mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; + + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } - return retvi; + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; - - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); + struct map_info *info; + int err = 0; - ret = 0; + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - for (;;) { - vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); - if (!vi) - break; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } + if (err) + goto free; - mm = vi->mm; - down_read(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_read(&mm->mmap_sem); - mmput(mm); - continue; - } - vaddr = vma_address(vma, uprobe->offset); - if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_read(&mm->mmap_sem); - mmput(mm); - continue; - } + down_write(&mm->mmap_sem); + vma = find_vma(mm, info->vaddr); + if (!vma || !valid_vma(vma, is_register) || + vma->vm_file->f_mapping->host != uprobe->inode) + goto unlock; - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); + if (vma->vm_start > info->vaddr || + vaddr_to_offset(vma, info->vaddr) != uprobe->offset) + goto unlock; - up_read(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobe_hash(). 
+ */ + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) @@ -977,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } -/* - * Of all the nodes that correspond to the given inode, return the node - * with the least offset. - */ -static struct rb_node *find_least_offset_node(struct inode *inode) +static struct rb_node * +find_node_in_range(struct inode *inode, loff_t min, loff_t max) { - struct uprobe u = { .inode = inode, .offset = 0}; struct rb_node *n = uprobes_tree.rb_node; - struct rb_node *close_node = NULL; - struct uprobe *uprobe; - int match; while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); + struct uprobe *u = rb_entry(n, struct uprobe, rb_node); - if (uprobe->inode == inode) - close_node = n; - - if (!match) - return close_node; - - if (match < 0) + if (inode < u->inode) { n = n->rb_left; - else + } else if (inode > u->inode) { n = n->rb_right; + } else { + if (max < u->offset) + n = n->rb_left; + else if (min > u->offset) + n = n->rb_right; + else + break; + } } - return close_node; + return n; } /* - * For a given inode, build a list of probes that need to be inserted. + * For a given range in vma, build a list of probes that need to be inserted. */ -static void build_probe_list(struct inode *inode, struct list_head *head) +static void build_probe_list(struct inode *inode, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *head) { - struct uprobe *uprobe; + loff_t min, max; unsigned long flags; - struct rb_node *n; - - spin_lock_irqsave(&uprobes_treelock, flags); - - n = find_least_offset_node(inode); + struct rb_node *n, *t; + struct uprobe *u; - for (; n; n = rb_next(n)) { - uprobe = rb_entry(n, struct uprobe, rb_node); - if (uprobe->inode != inode) - break; + INIT_LIST_HEAD(head); + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; - list_add(&uprobe->pending_list, head); - atomic_inc(&uprobe->ref); + spin_lock_irqsave(&uprobes_treelock, flags); + n = find_node_in_range(inode, min, max); + if (n) { + for (t = n; t; t = rb_prev(t)) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset < min) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } + for (t = n; (t = rb_next(t)); ) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset > max) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } } - spin_unlock_irqrestore(&uprobes_treelock, flags); } @@ -1059,28 +1033,21 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!inode) return 0; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); + build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); ret = 0; count = 0; list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - loff_t vaddr; - - list_del(&uprobe->pending_list); if (!ret) { - vaddr = vma_address(vma, uprobe->offset); - - if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { - put_uprobe(uprobe); - continue; - } + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); ret = install_breakpoint(uprobe, vma->vm_mm, vma, 
vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobe_hash(). + */ if (ret == -EEXIST) { ret = 0; @@ -1121,6 +1088,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; + if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ + return; + if (!atomic_read(&vma->vm_mm->uprobes_state.count)) return; @@ -1128,24 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!inode) return; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); + build_probe_list(inode, vma, start, end, &tmp_list); list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - loff_t vaddr; - - list_del(&uprobe->pending_list); - vaddr = vma_address(vma, uprobe->offset); - - if (vaddr >= start && vaddr < end) { - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); - } + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); + /* + * An unregister could have removed the probe before + * unmap. So check before we decrement the count. + */ + if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) + atomic_dec(&vma->vm_mm->uprobes_state.count); put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); @@ -1378,9 +1341,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1398,7 +1358,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1417,7 +1376,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } @@ -1479,41 +1437,61 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode = vma->vm_file->f_mapping->host; + loff_t offset = vaddr_to_offset(vma, bp_vaddr); + + uprobe = find_uprobe(inode, offset); + } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; + } + up_read(&mm->mmap_sem); + + return uprobe; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 
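
The find_active_uprobe() helper above routes all address arithmetic through vaddr_to_offset() instead of open-coding it per call site; the removed handle_swbp() lines below still show the manual form. Based on that removed arithmetic, the conversion helpers presumably reduce to something like the following sketch (not part of this hunk):

	static inline unsigned long
	offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
	{
		return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	}

	static inline loff_t
	vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
	{
		return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
	}
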
*/ static void handle_swbp(struct pt_regs *regs) { - struct vm_area_struct *vma; struct uprobe_task *utask; struct uprobe *uprobe; - struct mm_struct *mm; unsigned long bp_vaddr; + int uninitialized_var(is_swbp); - uprobe = NULL; bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; - down_read(&mm->mmap_sem); - vma = find_vma(mm, bp_vaddr); - - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; - - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); - } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; - up_read(&mm->mmap_sem); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. + */ + instruction_pointer_set(regs, bp_vaddr); + } return; } @@ -1620,7 +1598,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1655,7 +1632,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc..f65345f9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). + */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(parent)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } list_del_rcu(&p->thread_group); } @@ -471,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; @@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state @@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. 
- */ - pid_ns->child_reaper = init_pid_ns.child_reaper; } else if (father->signal->has_child_subreaper) { struct task_struct *reaper; @@ -946,14 +953,11 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes, and in - * task_work_add() to avoid the race with exit_task_work(). + * an exiting task cleaning up the robust pi futexes. */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_task_work(tsk); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), @@ -988,6 +992,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_work(tsk); check_stack_usage(); exit_thread(); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b..3bd2280 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -114,6 +114,10 @@ int nr_processes(void) return total; } +void __weak arch_release_task_struct(struct task_struct *tsk) +{ +} + #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; @@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node) return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } -void __weak arch_release_task_struct(struct task_struct *tsk) { } - static inline void free_task_struct(struct task_struct *tsk) { - arch_release_task_struct(tsk); kmem_cache_free(task_struct_cachep, tsk); } #endif +void __weak arch_release_thread_info(struct thread_info *ti) +{ +} + #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR -void __weak arch_release_thread_info(struct thread_info *ti) { } /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else @@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); kmem_cache_free(thread_info_cache, ti); } @@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); + arch_release_thread_info(tsk->stack); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); + arch_release_task_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -298,14 +302,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; ti = alloc_thread_info_node(tsk, node); - if (!ti) { - free_task_struct(tsk); - return NULL; - } + if (!ti) + goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto out; + goto free_ti; tsk->stack = ti; @@ -333,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return tsk; -out: +free_ti: free_thread_info(ti); +free_tsk: free_task_struct(tsk); return NULL; } @@ -378,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); + -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len; - len = (mpnt->vm_end - 
mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len = vma_pages(mpnt); + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; @@ -1305,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif @@ -1415,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_HLIST_HEAD(&p->task_works); + p->task_works = NULL; /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/futex.c b/kernel/futex.c index e2b0fb9..3717e7b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; @@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); @@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. 
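
For context on the WARN_ON() change in the futex hunk above: the old check tested the address of a struct member, which can never be NULL, so it could never fire. A minimal illustration of the difference (inside futex_wait_requeue_pi()):

	struct futex_q q = futex_q_init;

	WARN_ON(!&q.pi_state);	/* address of a member: always non-NULL, never warns */
	WARN_ON(!q.pi_state);	/* intended check: the pi_state pointer itself */
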
*/ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf5..6db7a5e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - entry_time = now = ktime_get(); + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1330,8 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. 
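
clock_was_set_delayed() above lets code that cannot take the full clock_was_set() path from hard interrupt context defer the hrtimer reprogramming to HRTIMER_SOFTIRQ. A hypothetical caller sketch (placement and condition are assumptions, not taken from this patch):

	/* after the wall clock has been stepped */
	if (in_irq())
		clock_was_set_delayed();	/* defer to the hrtimer softirq */
	else
		clock_was_set();
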
*/ - now = ktime_get(); + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; @@ -1343,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; @@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bdb1803..131ca17 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; + unsigned int flags = 0, irq = desc->irq_data.irq; do { irqreturn_t res; @@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - random |= action->flags; + flags |= action->flags; break; default: @@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (random & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41c1564..49a7772 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> @@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, { struct irq_domain *domain; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); + domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, + of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; @@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, } /** + * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in mapping + * @first_irq: first number of irq block assigned to the domain + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Allocates a legacy irq_domain if irq_base is positive or a linear + * domain otherwise. + * + * This is intended to implement the expected behaviour for most + * interrupt controllers which is that a linear mapping should + * normally be used unless the system requires a legacy mapping in + * order to support supplying interrupt numbers during non-DT + * registration of devices. + */ +struct irq_domain *irq_domain_add_simple(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) +{ + if (first_irq > 0) + return irq_domain_add_legacy(of_node, size, first_irq, 0, + ops, host_data); + else + return irq_domain_add_linear(of_node, size, ops, host_data); +} + +/** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 
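
A sketch of how a driver probe might use the irq_domain_add_simple() helper introduced above; the node pointer, size, ops structure and private pointer are illustrative assumptions:

	/* irq_base > 0: a legacy block was reserved by board code; 0: linear domain */
	domain = irq_domain_add_simple(np, 32, irq_base,
				       &my_intc_domain_ops, priv);
	if (!domain)
		return -ENODEV;
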
* @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in legacy mapping @@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(domain, irq, hwirq); + if (ops->map) + ops->map(domain, irq, hwirq); /* Clear norequest flags */ irq_clear_status_flags(irq, IRQ_NOREQUEST); @@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: Number of interrupts in the domain. * @ops: map/unmap domain callbacks @@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, struct irq_domain *domain; unsigned int *revmap; - revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, + of_node_to_nid(of_node)); if (WARN_ON(!revmap)) return NULL; @@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static void irq_domain_disassociate_many(struct irq_domain *domain, + unsigned int irq_base, int count) { - struct irq_data *irq_data = irq_get_irq_data(virq); + /* + * disassociate in reverse order; + * not strictly necessary, but nice for unwinding + */ + while (count--) { + int irq = irq_base + count; + struct irq_data *irq_data = irq_get_irq_data(irq); + irq_hw_number_t hwirq = irq_data->hwirq; + + if (WARN_ON(!irq_data || irq_data->domain != domain)) + continue; + + irq_set_status_flags(irq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(irq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(irq); + + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, irq); + smp_mb(); - irq_data->hwirq = hwirq; - irq_data->domain = domain; - if (domain->ops->map(domain, virq, hwirq)) { - pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); irq_data->domain = NULL; irq_data->hwirq = 0; - return -1; + + /* Clear reverse map */ + switch(domain->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } } +} + +int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + unsigned int virq = irq_base; + irq_hw_number_t hwirq = hwirq_base; + int i, ret; + + pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, + of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + + for (i = 0; i < count; i++) { + struct irq_data *irq_data = irq_get_irq_data(virq + i); + + if (WARN(!irq_data, "error: irq_desc not allocated; " + "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) + return -EINVAL; + if (WARN(irq_data->domain, "error: irq_desc already associated; " + "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) + return -EINVAL; + }; + + for (i = 0; i < count; i++, virq++, hwirq++) { + struct irq_data *irq_data = 
irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map) { + ret = domain->ops->map(domain, virq, hwirq); + if (ret != 0) { + pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", + virq, hwirq, ret); + WARN_ON(1); + irq_data->domain = NULL; + irq_data->hwirq = 0; + goto err_unmap; + } + } - irq_clear_status_flags(virq, IRQ_NOREQUEST); + switch (domain->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = virq; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + break; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + } return 0; + + err_unmap: + irq_domain_disassociate_many(domain, irq_base, i); + return -EINVAL; } +EXPORT_SYMBOL_GPL(irq_domain_associate_many); /** * irq_create_direct_mapping() - Allocate an irq for direct mapping @@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - BUG_ON(domain == NULL); - WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) + return 0; - virq = irq_alloc_desc_from(1, 0); + virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; @@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) } pr_debug("create_direct obtained virq %d\n", virq); - if (irq_setup_virq(domain, virq, virq)) { + if (irq_domain_associate(domain, virq, virq)) { irq_free_desc(virq); return 0; } @@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain, hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_desc_from(hint, 0); + virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); if (virq <= 0) - virq = irq_alloc_desc_from(1, 0); + virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; } - if (irq_setup_virq(domain, virq, hwirq)) { - if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) - irq_free_desc(virq); + if (irq_domain_associate(domain, virq, hwirq)) { + irq_free_desc(virq); return 0; } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); + hwirq, of_node_full_name(domain->of_node), virq); return virq; } EXPORT_SYMBOL_GPL(irq_create_mapping); +/** + * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs + * @domain: domain owning the interrupt range + * @irq_base: beginning of linux IRQ range + * @hwirq_base: beginning of hardware IRQ range + * @count: Number of interrupts to map + * + * This routine is used for allocating and mapping a range of hardware + * irqs to linux irqs where the linux irq numbers are at pre-defined + * locations. For use by controllers that already have static mappings + * to insert in to the domain. + * + * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time + * domain insertion. + * + * 0 is returned upon success, while any failure to establish a static + * mapping is treated as an error. 
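 *
 * A hypothetical call, for a controller whose 16 hwirqs must land on the
 * pre-reserved Linux IRQs 64..79 (numbers are illustrative only):
 *
 *	err = irq_create_strict_mappings(domain, 64, 0, 16);
 *	if (err)
 *		pr_err("strict hwirq mapping failed: %d\n", err);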
+ */ +int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + int ret; + + ret = irq_alloc_descs(irq_base, irq_base, count, + of_node_to_nid(domain->of_node)); + if (unlikely(ret < 0)) + return ret; + + ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); + if (unlikely(ret < 0)) { + irq_free_descs(irq_base, count); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(irq_create_strict_mappings); + unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { @@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return intspec[0]; #endif pr_warning("no irq domain found for %s !\n", - controller->full_name); + of_node_full_name(controller)); return 0; } @@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_domain *domain; - irq_hw_number_t hwirq; if (!virq || !irq_data) return; @@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq) if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return; - irq_set_status_flags(virq, IRQ_NOREQUEST); - - /* remove chip and handler */ - irq_set_chip_and_handler(virq, NULL, NULL); - - /* Make sure it's completed */ - synchronize_irq(virq); - - /* Tell the PIC about it */ - if (domain->ops->unmap) - domain->ops->unmap(domain, virq); - smp_mb(); - - /* Clear reverse map */ - hwirq = irq_data->hwirq; - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - break; - } - + irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); * irq_find_mapping() - Find a linux irq from an hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space - * - * This is a slow path, for use by generic code. It's expected that an - * irq controller implementation directly calls the appropriate low level - * mapping function. 
*/ unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int i; - unsigned int hint = hwirq % nr_irqs; + struct irq_data *data; /* Look for default domain if nececssary */ if (domain == NULL) @@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - /* legacy -> bail early */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + switch (domain->revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: return irq_domain_legacy_revmap(domain, hwirq); - - /* Slow path does a linear search of the map */ - if (hint == 0) - hint = 1; - i = hint; - do { - struct irq_data *data = irq_get_irq_data(i); + case IRQ_DOMAIN_MAP_LINEAR: + return irq_linear_revmap(domain, hwirq); + case IRQ_DOMAIN_MAP_TREE: + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + if (data) + return data->irq; + break; + case IRQ_DOMAIN_MAP_NOMAP: + data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) - return i; - i++; - if (i >= nr_irqs) - i = 1; - } while(i != hint); + return hwirq; + break; + } + return 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); /** - * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path, for use by irq controller code that uses radix tree - * revmaps - */ -unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data; - - if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(domain, hwirq); - - /* - * Freeing an irq can delete nodes along the path to - * do the lookup via call_rcu. - */ - rcu_read_lock(); - irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - - /* - * If found in radix tree, then fine. - * Else fallback to linear lookup - this should not happen in practice - * as it means that we failed to insert the node in the radix tree. - */ - return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); -} -EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); - -/** - * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. - * @domain: domain owning this hardware interrupt - * @virq: linux irq number - * @hwirq: hardware irq number in that domain space - * - * This is for use by irq controllers that use a radix tree reverse - * mapping for fast lookup. - */ -void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return; - - if (virq) { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } -} -EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); - -/** * irq_linear_revmap() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space * - * This is a fast path, for use by irq controller code that uses linear - * revmaps. It does fallback to the slow path if the revmap doesn't exist - * yet and will create the revmap entry with appropriate locking + * This is a fast path that can be called directly by irq controller code to + * save a handful of instructions. 
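 *
 * A sketch of the intended use in a chained/cascaded handler (the controller
 * names are assumptions, not taken from this patch):
 *
 *	hwirq = my_intc_read_pending(priv);
 *	virq = irq_linear_revmap(priv->domain, hwirq);
 *	if (virq)
 *		generic_handle_irq(virq);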
*/ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int *revmap; - - if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(domain, hwirq); + BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); - /* Check revmap bounds */ - if (unlikely(hwirq >= domain->revmap_data.linear.size)) - return irq_find_mapping(domain, hwirq); - - /* Check if revmap was allocated */ - revmap = domain->revmap_data.linear.revmap; - if (unlikely(revmap == NULL)) - return irq_find_mapping(domain, hwirq); - - /* Fill up revmap with slow path if no mapping found */ - if (unlikely(!revmap[hwirq])) - revmap[hwirq] = irq_find_mapping(domain, hwirq); + /* Check revmap bounds; complain if exceeded */ + if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) + return 0; - return revmap[hwirq]; + return domain->revmap_data.linear.revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); @@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain && desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; + if (desc->irq_data.domain) + p = of_node_full_name(desc->irq_data.domain->of_node); else p = none; seq_printf(m, "%s\n", p); @@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ -static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hwirq) -{ - return 0; -} - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); const struct irq_domain_ops irq_domain_simple_ops = { - .map = irq_domain_simple_map, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c54823..4c69326 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } -static void irq_thread_dtor(struct task_work *unused) +static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; @@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused) */ static int irq_thread(void *data) { - struct task_work on_exit_work; + struct callback_head on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -830,7 +830,7 @@ static int irq_thread(void *data) sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor, NULL); + init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { @@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. 
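
The manage.c hunk above follows the task_work API change in this merge: the work item is now a plain struct callback_head and init_task_work() takes only the callback. A minimal sketch of the new shape, mirroring the irq_thread() usage (my_release()/my_thread_fn() are assumed example names):

	static void my_release(struct callback_head *unused)
	{
		/* runs in task context when the task returns to user mode or exits */
	}

	static int my_thread_fn(void *data)
	{
		struct callback_head on_exit;

		init_task_work(&on_exit, my_release);
		task_work_add(current, &on_exit, false);	/* no notification needed */
		/* ... */
		return 0;
	}
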
- * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } /* * Check whether the interrupt nests into another interrupt @@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + + /* * The following block of code has to be executed atomically */ raw_spin_lock_irqsave(&desc->lock, flags); @@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it @@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 4e2e472..0668d58 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index ff2c7cb..6f99aea 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,6 +45,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +/* + * kmod_thread_locker is used for deadlock avoidance. There is no explicit + * locking to protect this global - it is private to the singleton khelper + * thread and should only ever be modified by that thread. + */ +static const struct task_struct *kmod_thread_locker; + #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -221,6 +228,13 @@ fail: return 0; } +static int call_helper(void *data) +{ + /* Worker thread started blocking khelper thread. */ + kmod_thread_locker = current; + return ____call_usermodehelper(data); +} + static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) @@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work) if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, + else { + pid = kernel_thread(call_helper, sub_info, CLONE_VFORK | SIGCHLD); + /* Worker thread stopped blocking khelper thread. 
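
On the IRQCHIP_ONESHOT_SAFE handling added to __setup_irq() above: drivers requesting a threaded interrupt without a primary handler still pass IRQF_ONESHOT as before; the core simply strips the flag when the chip (MSI, for example) is inherently one-shot safe. A typical request, with the handler, name and cookie assumed:

	ret = request_threaded_irq(irq, NULL, my_thread_handler,
				   IRQF_ONESHOT, "my-dev", dev);
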
*/ + kmod_thread_locker = NULL; + } switch (wait) { case UMH_NO_WAIT: @@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + /* + * Worker thread must not wait for khelper thread at below + * wait_for_completion() if the thread was created with CLONE_VFORK + * flag, for khelper thread is already waiting for the thread at + * wait_for_completion() in do_fork(). + */ + if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { + retval = -EBUSY; + goto out; + } sub_info->complete = &done; sub_info->wait = wait; @@ -577,6 +604,12 @@ unlock: return retval; } +/* + * call_usermodehelper_fns() will not run the caller-provided cleanup function + * if a memory allocation failure is experienced. So the caller might need to + * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform + * the necessaary cleanup within the caller. + */ int call_usermodehelper_fns( char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633..b579af5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -360,16 +360,12 @@ repeat: struct kthread_work, node); list_del_init(&work->node); } + worker->current_work = work; spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); } else if (!freezing(current)) schedule(); @@ -378,6 +374,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* insert @work before @pos in @worker */ +static void insert_kthread_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + lockdep_assert_held(&worker->lock); + + list_add_tail(&work->node, pos); + work->worker = worker; + if (likely(worker->task)) + wake_up_process(worker->task); +} + /** * queue_kthread_work - queue a kthread_work * @worker: target kthread_worker @@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); + insert_kthread_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); @@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(queue_kthread_work); +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + /** * flush_kthread_work - flush a kthread_work * @work: work to flush @@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work); */ void flush_kthread_work(struct kthread_work *work) { - int seq = work->queue_seq; - - atomic_inc(&work->flushing); + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + struct kthread_worker *worker; + bool noop = false; - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. 
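
The kthread.c changes in this hunk replace the sequence-counter based flush with a kthread_flush_work completion queued right behind the target work. The caller-visible API is unchanged; a minimal usage sketch, assuming the init helpers keep their existing names (init_kthread_worker()/init_kthread_work(), not touched by this hunk) and with the worker name and work function invented for illustration:

	static struct kthread_worker my_worker;
	static struct kthread_work my_work;

	static void my_work_fn(struct kthread_work *work)
	{
		/* ... do the deferred work ... */
	}

	static int my_init(void)
	{
		init_kthread_worker(&my_worker);
		kthread_run(kthread_worker_fn, &my_worker, "my-worker");	/* error handling elided */

		init_kthread_work(&my_work, my_work_fn);
		queue_kthread_work(&my_worker, &my_work);
		flush_kthread_work(&my_work);	/* returns once my_work_fn() has finished */
		return 0;
	}
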
- */ - smp_mb__after_atomic_inc(); +retry: + worker = work->worker; + if (!worker) + return; - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); + spin_lock_irq(&worker->lock); + if (work->worker != worker) { + spin_unlock_irq(&worker->lock); + goto retry; + } - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). - */ - smp_mb__after_atomic_dec(); -} -EXPORT_SYMBOL_GPL(flush_kthread_work); + if (!list_empty(&work->node)) + insert_kthread_work(worker, &fwork.work, work->node.next); + else if (worker->current_work == work) + insert_kthread_work(worker, &fwork.work, worker->work_list.next); + else + noop = true; -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; + spin_unlock_irq(&worker->lock); -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); + if (!noop) + wait_for_completion(&fwork.done); } +EXPORT_SYMBOL_GPL(flush_kthread_work); /** * flush_kthread_worker - flush all current works on a kthread_worker diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a1..e1b2822 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; @@ -75,6 +75,14 @@ void panic(const char *fmt, ...) int state = 0; /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since + * there is nothing to prevent an interrupt handler (that runs + * after the panic_lock is acquired) from invoking panic again. + */ + local_irq_disable(); + + /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... @@ -108,8 +116,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +123,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 16b20e3..b3c7fd5 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); + /* Firstly reap the EXIT_ZOMBIE children we may have. */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + /* + * sys_wait4() above can't reap the TASK_DEAD children. + * Make sure they all go away, see __unhash_process(). 
+ */ + for (;;) { + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) + break; + schedule(); + } + if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8f9b4eb..a70518c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -175,7 +175,7 @@ config PM_TEST_SUSPEND You probably want to have your system's RTC driver statically linked, ensuring that it's available when this test runs. -config CAN_PM_TRACE +config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP @@ -196,7 +196,7 @@ config PM_TRACE config PM_TRACE_RTC bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE + depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE ---help--- diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db3..b26f5f1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -5,6 +5,7 @@ * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. */ @@ -27,7 +28,6 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> -#include <scsi/scsi_scan.h> #include "power.h" @@ -46,6 +46,9 @@ enum { HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); dpm_complete(msg); @@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -514,6 +521,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -557,6 +565,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: @@ -574,6 +583,10 @@ int hibernation_platform_enter(void) */ static void power_down(void) { +#ifdef CONFIG_SUSPEND + int error; +#endif + switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); @@ -583,6 +596,25 @@ static void power_down(void) case HIBERNATION_SHUTDOWN: kernel_power_off(); break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if (hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! 
" + "Try swapon -a.\n"); + return; +#endif } kernel_halt(); /* @@ -748,13 +780,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; @@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif }; /* @@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif hibernation_mode = mode; break; case HIBERNATION_PLATFORM: diff --git a/kernel/power/main.c b/kernel/power/main.c index 428f8a0..f458238 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. + */ +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = !!val; + return n; +} + +power_attr(pm_print_times); + +static inline void pm_print_times_init(void) +{ + pm_print_times_enabled = !!initcall_debug; +} +#else /* !CONFIG_PP_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + struct kobject *power_kobj; /** @@ -531,6 +572,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#ifdef CONFIG_PM_SLEEP_DEBUG + &pm_print_times_attr.attr, +#endif #endif NULL, }; @@ -566,6 +610,7 @@ static int __init pm_init(void) error = sysfs_create_group(power_kobj, &attr_group); if (error) return error; + pm_print_times_init(); return pm_autosleep_init(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index b0bd4be..7d4b7ff 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -156,6 +156,9 @@ extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif /* kernel/power/block_io.c */ extern struct block_device *hib_resume_bdev; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262..c8b7446 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include 
<linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/ftrace.h> #include <trace/events/power.h> #include "power.h" @@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11e22c0..3c9d764 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle, struct timeval start; struct timeval stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_CONT "\b\b\b\bdone\n"); - else - printk(KERN_CONT "\n"); + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return ret; } @@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages) ... ", + "PM: Compressing and saving image data (%u pages)...\n", nr_threads, nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", - nr_pages / m); + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -761,11 +762,8 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) { - printk(KERN_CONT "\b\b\b\bdone\n"); - } else { - printk(KERN_CONT "\n"); - } + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle, int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... 
", + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return ret; } @@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages) ... ", + "PM: Loading and decompressing image data (%u pages)...\n", nr_threads, nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1344,7 +1345,7 @@ out_finish: } do_gettimeofday(&stop); if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1357,8 +1358,7 @@ out_finish: } } } - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) @@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode) blkdev_put(hib_resume_bdev, mode); } +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; +} +#endif + static int swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd0..4ed81e7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> -#include <scsi/scsi_scan.h> #include <asm/uaccess.h> @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. */ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c8fba33..8f50de3 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -9,6 +9,7 @@ * manipulate wakelocks on Android. 
*/ +#include <linux/capability.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> @@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + while (*str && !isspace(*str)) str++; @@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + len = strlen(buf); if (!len) return -EINVAL; diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2..66a2ea3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -193,12 +193,21 @@ static int console_may_schedule; * separated by ',', and find the message after the ';' character. */ +enum log_flags { + LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_PREFIX = 4, /* text started with a prefix */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ +}; + struct log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ - u16 level; /* syslog level + facility */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ }; /* @@ -207,9 +216,12 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static enum log_flags syslog_prev; +static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -217,20 +229,25 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -286,6 +303,7 @@ static u32 log_next(u32 idx) /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -329,8 +347,13 @@ static void log_store(int facility, int level, msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; - msg->level = (facility << 3) | (level & 7); - msg->ts_nsec = local_clock(); + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; + if (ts_nsec > 0) + msg->ts_nsec = ts_nsec; + else + msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = sizeof(struct log) + text_len + dict_len + pad_len; @@ -343,6 +366,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -365,8 +389,10 @@ 
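The struct log change above stops storing a packed syslog value and keeps facility and level in separate bit-fields; the classic <N> prefix is rebuilt on demand as (facility << 3) | level, exactly as devkmsg_read() and print_prefix() do further down. A minimal standalone sketch of that packing and unpacking (the helper name and sample values are illustrative, not from the patch):

#include <stdio.h>

/* Rebuild the legacy <prefix> value from the split fields, then take it apart again. */
static unsigned int make_prefix(unsigned int facility, unsigned int level)
{
        return (facility << 3) | (level & 7);
}

int main(void)
{
        unsigned int prefix = make_prefix(1, 6);   /* facility 1 (user), level 6 (info) */

        printf("<%u> -> facility %u, level %u\n", prefix, prefix >> 3, prefix & 7);
        return 0;
}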
static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = buf; for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { + ret = -EFAULT; goto out; + } line += iv[i].iov_len; } @@ -408,27 +434,30 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; if (!user) return -EBADF; - mutex_lock(&user->lock); - raw_spin_lock(&logbuf_lock); + ret = mutex_lock_interruptible(&user->lock); + if (ret) + return ret; + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -436,21 +465,38 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. 
+ */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -474,7 +520,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } @@ -486,7 +532,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -513,7 +559,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -537,7 +583,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -551,14 +597,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -582,10 +628,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; @@ -627,6 +673,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. 
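After the sprintf() above, every /dev/kmsg record starts with a "prefix,sequence,timestamp,cont-flag;" header followed by the escaped message text, where the cont flag is '-', 'c' or '+'. A rough userspace parsing sketch, assuming only that header layout (the sample record itself is made up):

#include <stdio.h>

int main(void)
{
        const char *rec = "6,148,332480,-;psmouse serio1: resync failed\n";
        unsigned int prefix;
        unsigned long long seq, ts_usec;
        char cont;

        if (sscanf(rec, "%u,%llu,%llu,%c;", &prefix, &seq, &ts_usec, &cont) == 4)
                printf("level %u, facility %u, seq %llu, %llu us, cont '%c'\n",
                       prefix & 7, prefix >> 3, seq, ts_usec, cont);
        return 0;
}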
+ */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif @@ -785,44 +840,64 @@ static bool printk_time; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec; + + if (!printk_time) + return 0; + + if (!buf) + return 15; + + rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "[%5lu.%06lu] ", + (unsigned long)ts, rem_nsec / 1000); +} + static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } - if (printk_time) { - if (buf) { - unsigned long long ts = msg->ts_nsec; - unsigned long rem_nsec = do_div(ts, 1000000000); - - len += sprintf(buf + len, "[%5lu.%06lu] ", - (unsigned long) ts, rem_nsec / 1000); - } else { - len += 15; - } - } - + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; + bool prefix = true; + bool newline = true; size_t len = 0; + if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) + prefix = false; + + if (msg->flags & LOG_CONT) { + if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) + prefix = false; + + if (!(msg->flags & LOG_NEWLINE)) + newline = false; + } + do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -837,19 +912,25 @@ static size_t msg_print_text(const struct log *msg, bool syslog, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; - len += print_prefix(msg, syslog, buf + len); + if (prefix) + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - buf[len++] = '\n'; + if (next || newline) + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len + 1; + if (prefix) + len += print_prefix(msg, syslog, NULL); + len += text_len; + if (next || newline) + len++; } + prefix = true; text = next; } while (text); @@ -860,26 +941,61 @@ static int syslog_print(char __user *buf, int size) { char *text; struct log *msg; - int len; + int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - } - msg = log_from_idx(syslog_idx); - len = msg_print_text(msg, true, text, LOG_LINE_MAX); - syslog_idx = log_next(syslog_idx); - syslog_seq++; - raw_spin_unlock_irq(&logbuf_lock); + while (size > 0) { + size_t n; + size_t skip; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + syslog_prev = 0; + syslog_partial = 0; + } + if 
(syslog_seq == log_next_seq) { + raw_spin_unlock_irq(&logbuf_lock); + break; + } - if (len > 0 && copy_to_user(buf, text, len)) - len = -EFAULT; + skip = syslog_partial; + msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ + syslog_idx = log_next(syslog_idx); + syslog_seq++; + syslog_prev = msg->flags; + n -= syslog_partial; + syslog_partial = 0; + } else if (!len){ + /* partial read(), remember position */ + n = size; + syslog_partial += n; + } else + n = 0; + raw_spin_unlock_irq(&logbuf_lock); + + if (!n) + break; + + if (copy_to_user(buf, text + skip, n)) { + if (!len) + len = -EFAULT; + break; + } + + len += n; + size -= n; + buf += n; + } kfree(text); return len; @@ -890,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -899,6 +1015,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + enum log_flags prev; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ @@ -909,41 +1026,50 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; + prev = 0; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; + prev = 0; while (len > size && seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; + prev = 0; while (len >= 0 && seq < next_seq) { struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; } idx = log_next(idx); seq++; + prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -956,6 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; + prev = 0; } } } @@ -1027,6 +1154,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_print_all(NULL, 0, true); + break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) @@ -1059,6 +1187,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; + syslog_partial = 0; } if (from_file) { /* @@ -1068,19 +1198,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) */ error = log_next_idx - syslog_idx; } else { - u64 seq; - u32 idx; + u64 seq = syslog_seq; + u32 idx = 
syslog_idx; + enum log_flags prev = syslog_prev; error = 0; - seq = syslog_seq; - idx = syslog_idx; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } + error -= syslog_partial; } raw_spin_unlock_irq(&logbuf_lock); break; @@ -1101,21 +1232,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -#ifdef CONFIG_KGDB_KDB -/* kdb dmesg command needs access to the syslog buffer. do_syslog() - * uses locks so it cannot be used during debugging. Just tell kdb - * where the start and end of the physical and logical logs are. This - * is equivalent to do_syslog(3). - */ -void kdb_syslog_data(char *syslog_data[4]) -{ - syslog_data[0] = log_buf; - syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_first_idx; - syslog_data[3] = log_buf + log_next_idx; -} -#endif /* CONFIG_KGDB_KDB */ - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) @@ -1259,22 +1375,121 @@ static inline void printk_delay(void) } } +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. + */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(enum log_flags flags) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. 
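The cont buffer introduced here exists to merge printk() fragments that are emitted without a trailing newline, so the familiar caller-side KERN_CONT pattern keeps working while complete lines are what end up as records. A minimal module sketch of that caller-side pattern (module name and messages are made up):

#include <linux/kernel.h>
#include <linux/module.h>

static int __init cont_demo_init(void)
{
        printk(KERN_INFO "cont_demo: probing");
        printk(KERN_CONT " step 1");
        printk(KERN_CONT " step 2");
        printk(KERN_CONT " done\n");    /* the newline completes the record */
        return 0;
}

static void __exit cont_demo_exit(void)
{
}

module_init(cont_demo_init);
module_exit(cont_demo_exit);
MODULE_LICENSE("GPL");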
+ */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.flags = 0; + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { static int recursion_bug; - static char cont_buf[LOG_LINE_MAX]; - static size_t cont_len; - static int cont_level; - static struct task_struct *cont_task; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; + enum log_flags lflags = 0; unsigned long flags; int this_cpu; - bool newline = false; - bool prefix = false; int printed_len = 0; boot_delay_msec(); @@ -1313,7 +1528,8 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, NULL, 0, recursion_msg, printed_len); + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, printed_len); } /* @@ -1325,81 +1541,67 @@ asmlinkage int vprintk_emit(int facility, int level, /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { text_len--; - newline = true; - } - - /* strip syslog prefix and extract log level or control flags */ - if (text[0] == '<' && text[1] && text[2] == '>') { - switch (text[1]) { - case '0' ... '7': - if (level == -1) - level = text[1] - '0'; - case 'd': /* KERN_DEFAULT */ - prefix = true; - case 'c': /* KERN_CONT */ - text += 3; - text_len -= 3; + lflags |= LOG_NEWLINE; + } + + /* strip kernel syslog prefix and extract log level or control flags */ + if (facility == 0) { + int kern_level = printk_get_level(text); + + if (kern_level) { + const char *end_of_header = printk_skip_level(text); + switch (kern_level) { + case '0' ... '7': + if (level == -1) + level = kern_level - '0'; + case 'd': /* KERN_DEFAULT */ + lflags |= LOG_PREFIX; + case 'c': /* KERN_CONT */ + break; + } + text_len -= end_of_header - text; + text = (char *)end_of_header; } } if (level == -1) level = default_message_loglevel; - if (dict) { - prefix = true; - newline = true; - } - - if (!newline) { - if (cont_len && (prefix || cont_task != current)) { - /* - * Flush earlier buffer, which is either from a - * different thread, or when we got a new prefix. 
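The open-coded "<N>" matching above gives way to printk_get_level()/printk_skip_level(), which recognize the KERN_<LEVEL> header that by this point in the series is a two-byte pattern: an SOH byte followed by the level character ('0'..'7', 'd' for KERN_DEFAULT, 'c' for KERN_CONT). A simplified standalone sketch of that extraction, not the actual helpers:

#include <stdio.h>

#define SOH '\001'      /* stands in for KERN_SOH, which marks an embedded level header */

/* Simplified model of the extraction; the real helpers live in <linux/printk.h>. */
static int get_level(const char *s)
{
        return (s[0] == SOH && s[1]) ? s[1] : 0;
}

int main(void)
{
        const char *msg = "\0014ACPI: thermal trip point updated\n";   /* made-up message */
        int kern_level = get_level(msg);

        if (kern_level)
                msg += 2;               /* skip the two-byte header */
        printf("level '%c': %s", kern_level, msg);
        return 0;
}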
- */ - log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!cont_len) { - cont_level = level; - cont_task = current; - } + if (!(lflags & LOG_NEWLINE)) { + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) + cont_flush(LOG_NEWLINE); - /* buffer or append to earlier buffer from the same thread */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { - if (cont_len && cont_task == current) { - if (prefix) { - /* - * New prefix from the same thread; flush. We - * either got no earlier newline, or we race - * with an interrupt. - */ - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + bool stored = false; - /* append to the earlier buffer and flush */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - cont_task = NULL; - printed_len = cont_len; - } else { - /* ordinary single and terminated line */ - log_store(facility, level, - dict, dictlen, text, text_len); - printed_len = text_len; + /* + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. + */ + if (cont.len && cont.owner == current) { + if (!(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, text_len); + cont_flush(LOG_NEWLINE); } + + if (!stored) + log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } + printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. @@ -1483,14 +1685,32 @@ asmlinkage int printk(const char *fmt, ...) 
} EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 #define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; +static enum log_flags console_prev; +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1762,9 +1982,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1782,6 +2027,7 @@ static u32 console_idx; */ void console_unlock(void) { + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1794,10 +2040,11 @@ void console_unlock(void) console_may_schedule = 0; + /* flush buffered message fragment immediately to console */ + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; - static char text[LOG_LINE_MAX]; size_t len; int level; @@ -1811,18 +2058,36 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; + console_prev = 0; } - +skip: if (console_seq == log_next_seq) break; msg = log_from_idx(console_idx); - level = msg->level & 7; - - len = msg_print_text(msg, false, text, sizeof(text)); + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. + */ + console_idx = log_next(console_idx); + console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. 
+ */ + msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; + goto skip; + } + level = msg->level; + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); console_idx = log_next(console_idx); console_seq++; + console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2085,6 +2350,7 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; + console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -2300,48 +2566,256 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. */ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. 
+ */ +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= log_next_seq) + goto out; + + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, 0, syslog, line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; +out: + if (len) + *len = l; + return ret; +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @buf: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. 
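With the old dump(dumper, reason, s1, l1, s2, l2) callback gone, a dumper now pulls the records itself via kmsg_dump_get_line() or kmsg_dump_get_buffer(). A hedged sketch of a dumper built only on the API shown above; the name, buffer size and pr_info() destination are illustrative, and a real dumper would copy the text to persistent storage instead:

#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
#include <linux/module.h>

static char line[1024];

static void demo_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
        size_t len;

        /* Walk the records from oldest to newest. */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                pr_info("dump fragment: %.*s", (int)len, line);
}

static struct kmsg_dumper demo_dumper = {
        .dump = demo_dump,
};

static int __init demo_dumper_init(void)
{
        return kmsg_dump_register(&demo_dumper);
}

static void __exit demo_dumper_exit(void)
{
        kmsg_dump_unregister(&demo_dumper);
}

module_init(demo_dumper_init);
module_exit(demo_dumper_exit);
MODULE_LICENSE("GPL");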
+ */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + enum log_flags prev; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - s2 = log_buf + idx; - l2 = log_next_idx - idx; + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l -= msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, syslog, buf + l, size - l); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. + */ +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; +} + +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + kmsg_dump_rewind_nolock(dumper); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41..4e6a61b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -54,6 +54,50 @@ #ifdef CONFIG_PREEMPT_RCU /* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. 
*/ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444..547b1fe 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -172,7 +172,7 @@ void rcu_irq_enter(void) local_irq_restore(flags); } -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. @@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. Nested diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d..918fd1e 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -351,8 +350,9 @@ static int rcu_initiate_boost(void) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_callbacks(); - } else + } else { RCU_TRACE(rcu_initiate_boost_trace()); + } return 1; } @@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void) } /* - * Tiny-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } /* - * Tiny-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. 
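Moving __rcu_read_lock()/__rcu_read_unlock() into rcupdate.c gives Tiny and Tree preemptible RCU a single shared implementation of the nesting counter; from a caller's point of view rcu_read_lock() sections simply nest, and only the outermost unlock can end up in rcu_read_unlock_special(). A small illustrative module (names and values are made up):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rcupdate.h>

static int value = 42;
static int __rcu *shared_ptr;

static int __init rcu_nest_demo_init(void)
{
        int *p;
        int v = 0;

        RCU_INIT_POINTER(shared_ptr, &value);

        rcu_read_lock();                /* ->rcu_read_lock_nesting becomes 1 */
        rcu_read_lock();                /* nests to 2; nothing special happens */
        p = rcu_dereference(shared_ptr);
        if (p)
                v = *p;
        rcu_read_unlock();
        rcu_read_unlock();              /* outermost unlock, may enter
                                           rcu_read_unlock_special() */
        pr_info("rcu_nest_demo: read %d\n", v);
        return 0;
}

static void __exit rcu_nest_demo_exit(void)
{
}

module_init(rcu_nest_demo_init);
module_exit(rcu_nest_demo_exit);
MODULE_LICENSE("GPL");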
If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -/* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the rcu_preempt_ctrlblk structure, which is * checked elsewhere. This is called from the scheduling-clock interrupt. @@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = NULL; /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) + if (!rcu_preempted_readers_exp()) { local_irq_restore(flags); - else { + } else { rcu_initiate_boost(); local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, @@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ int rcu_preempt_needs_cpu(void) { - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34a..25b1503 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -49,8 +49,7 @@ #include <asm/byteorder.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. 
*/ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); @@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p) if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; rcu_torture_free(rp); - } else + } else { cur_ops->deferred_free(rp); + } } static int rcu_no_completed(void) @@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void) synchronize_srcu(&srcu_ctl); } +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; @@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) + cur_ops->cb_barrier(); + else + cur_ops->sync(); rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld " - "barrier: %ld/%ld:%ld", + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts, + n_offline_attempts); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); @@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg) delta = shutdown_time - jiffies_snap; if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", + "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); @@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg) if (cpu_down(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", + "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; } @@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg) if (cpu_up(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", + "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; } @@ -1631,6 
+1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; + bool lastphase = 0; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - atomic_read(&barrier_cbs_count) == n_barrier_cbs || + barrier_phase != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); @@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - /* wake_up() path contains the required barriers. */ + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, @@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; @@ -1908,8 +1925,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, - &srcu_raw_sync_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, + &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1931,8 +1948,7 @@ rcu_torture_init(void) return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88..f280e54 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,36 +60,44 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; - -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. 
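The barrier test above now wakes its callback threads by flipping barrier_phase, with explicit smp_mb() calls ordering the flip against both the wait condition and the subsequent ->call() invocations. A rough userspace analogue of that phase-flip handshake, using C11 atomics in place of the kernel primitives (purely illustrative, not the rcutorture code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool phase;

static void *waiter(void *arg)
{
        bool lastphase = false;
        int i;

        (void)arg;
        for (i = 0; i < 3; i++) {
                while (atomic_load(&phase) == lastphase)    /* wait for a flip */
                        usleep(1000);
                lastphase = atomic_load(&phase);            /* remember the phase we saw */
                printf("waiter: phase flipped to %d\n", (int)lastphase);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;
        int i;

        pthread_create(&t, NULL, waiter, NULL);
        for (i = 0; i < 3; i++) {
                usleep(100000);
                /* a seq_cst store stands in for smp_mb() plus the plain assignment */
                atomic_store(&phase, !atomic_load(&phase));
        }
        pthread_join(t, NULL);
        return 0;
}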
*/ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + +#define RCU_STATE_INITIALIZER(sname, cr) { \ + .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ - .name = #structname, \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ + .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); + +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* * The rcu_scheduler_active variable transitions from zero to one just @@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! 
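rcu_fanout_leaf becomes a boot-time parameter that can raise, but not lower, CONFIG_RCU_FANOUT_LEAF. Because rcutree.c is built in, the value would presumably be passed on the kernel command line, along the lines of:

        rcutree.rcu_fanout_leaf=32

(the rcutree. prefix assumes the usual KBUILD_MODNAME-based naming for built-in module parameters).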
*/ @@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -584,8 +586,6 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * @@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online? Disable preemption to avoid false positives @@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void) } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /** * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle @@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; - int ndetected; + int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) */ rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected = rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); @@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; rdp->passed_quiesce = 0; - } else + } else { rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + +/* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp * belongs. In addition, the corresponding leaf rcu_node structure's @@ -1327,8 +1339,6 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; - /* * Orphan the callbacks. First adjust the counts. 
This is safe * because ->onofflock excludes _rcu_barrier()'s adoption of @@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; } /* @@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, } /* Finally, initialize the rcu_data structure's list to empty. */ - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); } /* @@ -1397,6 +1405,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; @@ -1502,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1528,7 +1541,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy; + int bl, count, count_lazy, i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1551,9 +1564,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; local_irq_restore(flags); /* Invoke callbacks. */ @@ -1581,15 +1594,15 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; else break; } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; + ACCESS_ONCE(rdp->qlen) -= count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -1602,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); local_irq_restore(flags); @@ -1742,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ @@ -1785,9 +1797,10 @@ unlock_fqs_ret: * whom the rdp belongs. 
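->qlen is now read and written through ACCESS_ONCE() so that code sampling the callback count without holding the lock sees single, un-torn accesses. For reference, ACCESS_ONCE() in this era is just a volatile cast (compiler.h); a standalone sketch of the idiom:

#include <stdio.h>

/* Same shape as the kernel macro of this period; shown from memory. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static long qlen;

int main(void)
{
        ACCESS_ONCE(qlen)++;            /* forces a real load and store, never cached away */
        printf("qlen sampled as %ld\n", ACCESS_ONCE(qlen));
        return 0;
}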
*/ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1823,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -1854,6 +1867,56 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) +{ + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) + invoke_rcu_core(); + + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) + return; + + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); +} + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp, bool lazy) @@ -1878,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - rdp->qlen++; + ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; else @@ -1893,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { - local_irq_restore(flags); - return; - } - - /* - * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() - * if some other CPU has recently done so. 
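A worked example of the hysteresis in the new __call_rcu_core() above, assuming the default qhimark of 10000 (it is a module parameter, so a given boot may differ): with qlen_last_fqs_check at 500, the grace period is kicked only once qlen exceeds 10500; the threshold then moves up to the current qlen, so the next kick needs roughly another 10000 enqueued callbacks, while blimit is opened up to LONG_MAX so rcu_do_batch() can work the backlog down.

static int over_qhimark(unsigned long qlen,
			unsigned long qlen_last_fqs_check,
			unsigned long qhimark)
{
	return qlen > qlen_last_fqs_check + qhimark;	/* e.g. 10501 > 500 + 10000 */
}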
Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback - * is the only one waiting for a grace period to complete. - */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - - /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); - - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ - } else { - /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; - } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } @@ -1959,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * occasionally incorrectly indicate that there are multiple CPUs online * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. - * - * Of course, sampling num_online_cpus() with preemption enabled can - * give erroneous results if there are concurrent CPU-hotplug operations. - * For example, given a demonic sequence of preemptions in num_online_cpus() - * and CPU-hotplug operations, there could be two or more CPUs online at - * all times, but num_online_cpus() might well return one (or even zero). - * - * However, all such demonic sequences require at least one CPU-offline - * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer - * is only a problem if there is an RCU read-side critical section executing - * throughout. But RCU-sched and RCU-bh read-side critical sections - * disable either preemption or bh, which prevents a CPU from going offline. - * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return - * that there is only one CPU when in fact there was more than one throughout - * is when there were no RCU readers in the system. If there are no - * RCU readers, the grace period by definition can be of zero length, - * regardless of the number of online CPUs. */ static inline int rcu_blocking_is_gp(void) { + int ret; + might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() <= 1; + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; } /** @@ -2115,9 +2131,9 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* No joy, try again later. Or just synchronize_sched(). 
*/ - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_sched(); return; } @@ -2238,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2250,20 +2269,41 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; +} + +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); } /* * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). */ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2271,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); + + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. 
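rcu_barrier_callback() above no longer ignores its argument: the rcu_head it receives is the ->barrier_head now embedded in each CPU's rcu_data (added to rcutree.h below), so container_of() recovers the enclosing structure and, through rdp->rsp, the flavor whose ->barrier_cpu_count must be decremented. A minimal sketch of that recovery step, with simplified stand-in types rather than the kernel's:

#include <stddef.h>

struct head { struct head *next; void (*func)(struct head *); };

struct data {
	long qlen;
	struct head barrier_head;	/* handed to the callback */
	void *rsp;			/* back-pointer to the flavor */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void barrier_cb(struct head *rhp)
{
	struct data *rdp = container_of(rhp, struct data, barrier_head);

	(void)rdp->rsp;		/* would decrement rsp->barrier_cpu_count here */
}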
*/ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2318,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp, * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! */ - init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + init_completion(&rsp->barrier_completion); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2335,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp, preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2359,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp, rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + complete(&rsp->barrier_completion); + + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); + smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. 
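The early-exit test above is worth a concrete example. ->n_barrier_done is odd while an _rcu_barrier() is in flight (the "Inc1" increment) and even again once it completes ("Inc2"), so ((snap + 1) & ~0x1) + 2 is the smallest value proving that a complete barrier both started and finished after our snapshot:

	snap = 4 (even, idle)      ->  need snap_done >= 6
	snap = 5 (odd, in flight)  ->  need snap_done >= 8

that is, an odd snapshot cannot be satisfied by the barrier that was already running when the snapshot was taken; a later full start/finish pair is required. As a one-line helper:

static unsigned long barrier_done_needed(unsigned long snap)
{
	return ((snap + 1) & ~0x1UL) + 2;
}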
*/ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2384,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2393,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); @@ -2404,18 +2485,15 @@ static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -2489,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2503,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2527,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
*/ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2571,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2583,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2610,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. */ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2646,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); +} + +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = nr_cpu_ids; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. 
+ */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. */ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; } void __init rcu_init(void) @@ -2660,6 +2802,7 @@ void __init rcu_init(void) int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138..4d29169 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -42,28 +42,28 @@ #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 -# define NUM_RCU_LVLS 1 +# define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_2 -# define NUM_RCU_LVLS 2 +# define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_3 -# define NUM_RCU_LVLS 3 +# define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_4 -# define NUM_RCU_LVLS 4 +# define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) @@ -76,6 +76,9 @@ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) +extern int rcu_num_lvls; +extern int rcu_num_nodes; + /* * Dynticks per-CPU state. */ @@ -84,6 +87,21 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. 
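A worked example of the rcu_init_geometry() computation above, for an assumed 64-bit build with CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16, booted with rcu_fanout_leaf=32 and nr_cpu_ids=256 (all of these values are hypothetical, chosen only to make the arithmetic concrete):

	rcu_capacity[] = { 1, 32, 2048, ... }		/* leaf fanout, then *64 per level */
	n = 256 first fits at i = 2, so:
	num_rcu_lvl[0] = DIV_ROUND_UP(256, 2048) = 1	/* root */
	num_rcu_lvl[1] = DIV_ROUND_UP(256,   32) = 8	/* leaf nodes */
	num_rcu_lvl[2] = DIV_ROUND_UP(256,    1) = 256	/* per-CPU fan-in, subtracted below */
	rcu_num_lvls   = 2
	rcu_num_nodes  = (1 + 8 + 256) - 256 = 9 rcu_node structures

so a two-level tree of one root over eight 32-CPU leaves replaces whatever geometry the compile-time NUM_RCU_LVL_* constants would have produced.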
*/ @@ -192,7 +210,7 @@ struct rcu_node { */ #define rcu_for_each_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* * Do a breadth-first scan of the non-leaf rcu_node structures for the @@ -201,7 +219,7 @@ struct rcu_node { */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -210,8 +228,8 @@ struct rcu_node { * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ @@ -297,6 +315,9 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; + /* 6) _rcu_barrier() callback. */ + struct rcu_head barrier_head; + int cpu; struct rcu_state *rsp; }; @@ -343,10 +364,12 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ + void (*func)(struct rcu_head *head)); /* The following fields are guarded by the root rcu_node's lock. */ @@ -378,6 +401,11 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -395,8 +423,13 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + struct list_head flavors; /* List of RCU flavors. */ }; +extern struct list_head rcu_struct_flavors; +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + /* Return values for rcu_preempt_offline_tasks(). 
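The for_each_rcu_flavor() iterator defined above is what lets the hand-unrolled rcu_sched/rcu_bh/rcu_preempt call sites throughout this patch collapse into loops: rcu_init_one() now ends with list_add(&rsp->flavors, &rcu_struct_flavors), and the macro is simply list_for_each_entry() over that list. A standalone sketch of the same registry pattern with simplified types (the kernel of course uses its list_head machinery, not this):

struct flavor {
	const char *name;
	struct flavor *next;		/* stand-in for the embedded list node */
};

static struct flavor *flavor_head;

static void register_flavor(struct flavor *f)	/* ~ list_add() in rcu_init_one() */
{
	f->next = flavor_head;
	flavor_head = f;
}

#define for_each_flavor(f)	for ((f) = flavor_head; (f); (f) = (f)->next)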
*/ #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ @@ -430,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -438,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_cpu_has_callbacks(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000..7f3244c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); + printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); #endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); } #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); +struct rcu_state rcu_preempt_state = + RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. 
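Related to the flavor list, RCU_STATE_INITIALIZER() now takes the flavor's call_rcu() variant as a second argument and stores it in the new rcu_state ->call field, which is how the rewritten rcu_barrier_func() and _rcu_barrier() earlier in this patch can post callbacks through rsp->call() instead of threading a function pointer through every caller. A type-level sketch with simplified, illustrative types only:

struct head;
typedef void (*cb_t)(struct head *);

struct flavor_state {
	const char *name;
	void (*call)(struct head *h, cb_t fn);	/* call_rcu / call_rcu_bh / call_rcu_sched */
};

static void post_barrier_cb(struct flavor_state *rsp, struct head *h, cb_t fn)
{
	rsp->call(h, fn);	/* what rcu_barrier_func() now does on each CPU */
}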
*/ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } /* - * Tree-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* * Check for preempted RCU readers blocking the current grace period * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. @@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) rnp->grphi, !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); - } else + } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); + } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ @@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } } -/* - * Tree-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* - * Suppress preemptible RCU's CPU stall warnings by pushing the - * time of the next stall-warning message comfortably far into the - * future. - */ -static void rcu_preempt_stall_reset(void) -{ - rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; -} - -/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Do CPU-offline processing for preemptible RCU. 
- */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ - rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); -} - -/* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, * which is checked elsewhere. @@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_state, - &__get_cpu_var(rcu_preempt_data)); -} - #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void) @@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) + if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - else { + } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; @@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_rcu(); return; } @@ -917,51 +851,16 @@ mb_ret: } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -/* - * Check to see if there is any immediate preemptible-RCU-related work - * to be done. - */ -static int rcu_preempt_pending(int cpu) -{ - return __rcu_pending(&rcu_preempt_state, - &per_cpu(rcu_preempt_data, cpu)); -} - -/* - * Does preemptible RCU have callbacks on this CPU? - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return !!per_cpu(rcu_preempt_data, cpu).nxtlist; -} - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state, call_rcu); + _rcu_barrier(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptible RCU's per-CPU data. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); -} - -/* - * Move preemptible RCU's callbacks from dying CPU to other online CPU - * and record a quiescent state. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ - rcu_cleanup_dying_cpu(&rcu_preempt_state); -} - -/* * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) @@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* - * Because preemptible RCU does not exist, there is no need to suppress - * its CPU stall warnings. - */ -static void rcu_preempt_stall_reset(void) -{ -} - -/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptible RCU does not exist, it never needs CPU-offline - * processing. 
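A quick worked example of the mutex_trylock() backoff above (the same pattern appears in synchronize_sched_expedited() earlier): the wait scales with both the retry count and the number of online CPUs, so on, say, a 16-CPU machine the ten attempts sleep roughly 16, 32, ..., 160 microseconds, a bit under a millisecond in total, before giving up and falling back to a plain synchronize_rcu().

static unsigned long expedited_backoff_us(int trycount, int ncpus)
{
	return (unsigned long)trycount * ncpus;		/* the value handed to udelay() */
}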
- */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ -} - -/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - -/* * Queue an RCU callback for lazy invocation after a grace period. * This will likely be later named something like "call_rcu_lazy()", * but this change will require some way of tagging the lazy RCU @@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptible RCU does not exist, it never has any work to do. - */ -static int rcu_preempt_pending(int cpu) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, it never has callbacks - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return 0; -} - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ @@ -1163,21 +1030,6 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptible RCU does not exist, there is no per-CPU - * data to initialize. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ -} - -/* - * Because there is no preemptible RCU, there is no cleanup to do. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ -} - -/* * Because preemptible RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) @@ -1886,8 +1738,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1959,43 +1812,10 @@ static void rcu_idle_count_callbacks_posted(void) */ #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. 
Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; -} +extern int tick_nohz_enabled; /* * Does the specified flavor of RCU have non-lazy callbacks pending on @@ -2040,6 +1860,50 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) } /* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, + RCU_IDLE_GP_DELAY) - jiffies; + } else { + *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + } + return 0; +} + +/* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. */ @@ -2075,22 +1939,26 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. 
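A worked example of the non-lazy *delta_jiffies computation in the new rcu_needs_cpu() above, with RCU_IDLE_GP_DELAY now 4:

	jiffies = 1000:  round_up(1000 + 4, 4) - 1000 = 1004 - 1000 = 4
	jiffies = 1001:  round_up(1001 + 4, 4) - 1001 = 1008 - 1001 = 7

so the requested wakeup is always 4 to 7 jiffies away and always lands on a 4-jiffy boundary, letting CPUs that enter dyntick-idle at slightly different times coalesce onto the same wakeup tick; the lazy branch does the same thing on whole-second boundaries via round_jiffies(). An equivalent open-coded helper, for illustration only:

static unsigned long gp_delay_delta(unsigned long now)
{
	const unsigned long d = 4;	/* RCU_IDLE_GP_DELAY */

	return (((now + d) + (d - 1)) & ~(d - 1)) - now;	/* round_up(now + d, d) - now */
}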
This will * do nothing if this timer is not active, so just cancel it unconditionally. */ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); + rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); } /* @@ -2108,42 +1976,53 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu)) + invoke_rcu_core(); /* force nohz to see update. */ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. */ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,36 +2031,38 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. 
*/ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - per_cpu(rcu_idle_gp_timer_expires, cpu) = - jiffies + RCU_IDLE_GP_DELAY; - else - per_cpu(rcu_idle_gp_timer_expires, cpu) = - jiffies + RCU_IDLE_LAZY_GP_DELAY; - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); + rdtp->idle_gp_timer_expires = + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); + } else { + rdtp->idle_gp_timer_expires = + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2213,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_callbacks(cpu)) { trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { trace_rcu_prep_idle("Callbacks drained"); + } } /* @@ -2227,7 +2109,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2238,11 +2120,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } @@ -2250,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { + *cp = '\0'; } #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16d..abffb48 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,31 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static int show_rcubarrier(struct seq_file *m, void *unused) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", + rsp->name, rsp->rcu_barrier_in_progress ? 
'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, NULL); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - static int show_rcudata(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + int cpu; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { + int cpu; + struct rcu_state *rsp; + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); @@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + for_each_rcu_flavor(rsp) { + seq_printf(m, "\"%s:\"\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], @@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) "B."[!rnp->boost_tasks], convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", (int)(jiffies & 0xffff), (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", rnp->n_balk_blkd_tasks, rnp->n_balk_exp_gp_tasks, rnp->n_balk_boost_tasks, @@ -270,15 +286,15 @@ static void 
print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->completed, gpnum, rsp->fqs_state, + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", + rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), + (int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; @@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + print_one_rcu_state(m, rsp); return 0; } @@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) static int show_rcugp(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + show_one_rcugp(m, rsp); return 0; } @@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->n_rcu_pending, + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", rdp->n_rp_qs_pending, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); } -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; struct rcu_data *rdp; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); return 0; } @@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void) if (!rcudir) goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, rcudir, + NULL, &rcubarrier_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); if (!retval) diff --git a/kernel/relay.c b/kernel/relay.c index ab56a17..e8cd202 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, + .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, ret += padding; out: - splice_shrink_spd(pipe, &spd); - return ret; + splice_shrink_spd(&spd); + return ret; } static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/resource.c b/kernel/resource.c index e1d2b8e..34d4588 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,6 +7,8 @@ * Arbitrary resource management. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/errno.h> #include <linux/ioport.h> @@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_lock(&resource_lock); + if (!parent) + goto skip; + if ((start < parent->start) || (end > parent->end)) goto out; - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - if (res->sibling && (res->sibling->start <= end)) goto out; @@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t goto out; } +skip: + for (tmp = res->child; tmp; tmp = tmp->sibling) + if ((tmp->start < start) || (tmp->end > end)) + goto out; + res->start = start; res->end = end; result = 0; @@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { + int abort = 0; + write_lock(&resource_lock); - __reserve_region_with_split(root, start, end, name); + if (root->start > start || root->end < end) { + pr_err("requested range [0x%llx-0x%llx] not in root %pr\n", + (unsigned long long)start, (unsigned long long)end, + root); + if (start > root->end || end < root->start) + abort = 1; + else { + if (end > root->end) + end = root->end; + if (start < root->start) + start = root->start; + pr_err("fixing request to [0x%llx-0x%llx]\n", + (unsigned long long)start, + (unsigned long long)end); + } + dump_stack(); + } + if (!abort) + __reserve_region_with_split(root, start, end, name); write_unlock(&resource_lock); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4..82ad284 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -1910,12 +1910,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + trace_sched_switch(prev, next); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); - trace_sched_switch(prev, next); } /** @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); @@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. 
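The comment above describes folding per-cpu deltas of nr_running + nr_uninterruptible into one global counter instead of sweeping for_each_possible_cpu(). A self-contained userspace model of that fold; C11 atomics stand in for atomic_long_t and the cpu_rq structure is a simplified invention.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

static atomic_long calc_load_tasks;            /* global accumulator */

struct cpu_rq {
    long nr_running;
    long nr_uninterruptible;
    long calc_load_active;                     /* last folded sample */
};

static struct cpu_rq rq[NR_CPUS];

/* Fold the change in this CPU's active count since the last fold. */
static long fold_active(struct cpu_rq *this_rq)
{
    long nr_active = this_rq->nr_running + this_rq->nr_uninterruptible;
    long delta = nr_active - this_rq->calc_load_active;

    this_rq->calc_load_active = nr_active;
    return delta;
}

int main(void)
{
    rq[0].nr_running = 3;
    rq[1].nr_uninterruptible = 2;

    /* Each CPU publishes only its own delta; the sum equals nr_active. */
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        long delta = fold_active(&rq[cpu]);
        if (delta)
            atomic_fetch_add(&calc_load_tasks, delta);
    }
    printf("global nr_active = %ld\n", atomic_load(&calc_load_tasks));
    return 0;
}

Because every CPU only ever contributes the change since its previous sample, the accumulated total still equals the full sum without any global scan.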
+ * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. 
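The a1 = a0 * e + a * (1 - e) update mentioned above is a fixed-point exponential moving average. A userspace model follows; the FSHIFT/EXP_* constants are written to mirror the kernel's loadavg constants, but treat the exact values and the lack of rounding as illustrative.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)  /* 1.0 in fixed point */
#define EXP_1    1884             /* 1/exp(5s/1min)  in FIXED_1 units */
#define EXP_5    2014             /* 1/exp(5s/5min)  */
#define EXP_15   2037             /* 1/exp(5s/15min) */

/* a1 = a0 * e + a * (1 - e), everything in FIXED_1 units */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
    load *= exp;
    load += active * (FIXED_1 - exp);
    return load >> FSHIFT;
}

int main(void)
{
    unsigned long avenrun[3] = { 0, 0, 0 };
    unsigned long active = 2 * FIXED_1;       /* two runnable tasks */

    /* One LOAD_FREQ (5 s) sample: decay the old value, blend in the new one. */
    avenrun[0] = calc_load(avenrun[0], EXP_1, active);
    avenrun[1] = calc_load(avenrun[1], EXP_5, active);
    avenrun[2] = calc_load(avenrun[2], EXP_15, active);

    printf("1min load %.2f\n", avenrun[0] / (double)FIXED_1);
    return 0;
}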
+ * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} -void calc_load_account_idle(struct rq *this_rq) +void calc_load_enter_idle(void) { + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2454,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? 
active * FIXED_1 : 0; + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */ calc_global_nohz(); } @@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * End of global load-average stuff + */ + +/* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load * @@ -4210,9 +4340,7 @@ recheck: */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } @@ -5894,6 +6022,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). 
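The catch-up path in calc_global_nohz() above applies the average for every missed LOAD_FREQ window at once, i.e. a_n = a_0 * e^n + a * (1 - e^n). Below is a sketch of that idea, computing e^n in fixed point by repeated squaring; it is a simplified analogue of the kernel's calc_load_n()/fixed_power_int(), and rounding details may differ.

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884

/* x^n in fixed point, exponentiation by squaring with per-step rounding. */
static unsigned long fixed_power(unsigned long x, unsigned int n)
{
    unsigned long result = FIXED_1;

    while (n) {
        if (n & 1) {
            result *= x;
            result += FIXED_1 / 2;
            result >>= FSHIFT;
        }
        n >>= 1;
        if (!n)
            break;
        x *= x;
        x += FIXED_1 / 2;
        x >>= FSHIFT;
    }
    return result;
}

/* a_n = a_0 * e^n + a * (1 - e^n) */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
    unsigned long en = fixed_power(exp, n);

    return (load * en + active * (FIXED_1 - en)) >> FSHIFT;
}

int main(void)
{
    /* A load of 1.00 decaying over 12 missed 5 s windows (one minute idle). */
    printf("%.2f\n", calc_load_n(FIXED_1, EXP_1, 0, 12) / (double)FIXED_1);
    return 0;
}

One minute of idleness shrinks the 1-minute average by roughly a factor of e, which matches the intent of the per-sample decay applied twelve times.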
* + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -5907,8 +6040,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; @@ -6967,34 +7132,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. 
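The notifier change above counts CPUs taken down with the _FROZEN variants so that cpusets are only consulted again once the last CPU of a resume sequence has come back. A toy model of that bookkeeping; the rebuild functions are placeholders for partition_sched_domains() and cpuset_update_active_cpus().

#include <stdio.h>

static int num_cpus_frozen;

static void rebuild_single_domain(void) { puts("single sched domain, cpusets ignored"); }
static void rebuild_from_cpusets(void)  { puts("sched domains rebuilt from cpusets"); }

/* CPU_DOWN_PREPARE_FROZEN: a CPU is going away as part of suspend. */
static void cpu_down_frozen(void)
{
    num_cpus_frozen++;
    rebuild_single_domain();
}

/* CPU_ONLINE_FROZEN: a CPU came back as part of resume. */
static void cpu_online_frozen(void)
{
    num_cpus_frozen--;
    if (num_cpus_frozen)
        rebuild_single_domain();    /* more CPUs still to come back */
    else
        rebuild_from_cpusets();     /* last one: restore the real topology */
}

int main(void)
{
    /* Suspend takes three non-boot CPUs down, resume brings them back. */
    for (int i = 0; i < 3; i++)
        cpu_down_frozen();
    for (int i = 0; i < 3; i++)
        cpu_online_frozen();
    return 0;
}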
+ */ + case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(true); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7459,6 +7656,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7473,6 +7671,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586f..23aa789 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -65,8 +65,8 @@ static int convert_prio(int prio) int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) { - int idx = 0; - int task_pri = convert_prio(p->prio); + int idx = 0; + int task_pri = convert_prio(p->prio); if (task_pri >= MAX_RT_PRIO) return 0; @@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, */ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; newpri = convert_prio(newpri); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6e..d0cc03b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. 
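The rewritten select_idle_sibling() above walks the domains below the shared-cache level and tests one precomputed buddy per level instead of scanning whole sched_groups. A simplified model follows; a bitmask stands in for the cpumask and the idle test is a toy.

#include <stdbool.h>
#include <stdio.h>

struct domain {
    int idle_buddy;              /* precomputed candidate CPU for this level */
    struct domain *child;        /* next lower domain, NULL at the bottom */
};

static bool cpu_allowed(int cpu, unsigned long allowed) { return allowed & (1UL << cpu); }
static bool cpu_is_idle(int cpu)                        { return cpu != 0; }  /* toy */

/* Return an idle sibling for the waking task, or fall back to @target. */
static int select_idle_sibling(struct domain *llc, unsigned long allowed, int target)
{
    for (struct domain *sd = llc; sd; sd = sd->child) {
        if (!cpu_allowed(sd->idle_buddy, allowed))
            continue;
        if (cpu_is_idle(sd->idle_buddy))
            return sd->idle_buddy;
    }
    return target;
}

int main(void)
{
    struct domain core = { .idle_buddy = 1, .child = NULL };
    struct domain llc  = { .idle_buddy = 0, .child = &core };

    printf("picked cpu %d\n", select_idle_sibling(&llc, 0x3, 2));
    return 0;
}

The cost of a wakeup becomes one check per domain level rather than one per CPU in the cache domain, which is the point of precomputing the buddies in update_top_cache_domain().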
*/ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } @@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + unsigned int flags; unsigned int loop; @@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -3642,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) + int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; @@ -3660,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, max_nr_running = 0; min_nr_running = ~0UL; - for_each_cpu_and(i, sched_group_cpus(group), cpus) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; @@ -3789,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. 
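The new LBF_SOME_PINNED handling above records one alternate destination CPU when a task's affinity excludes dst_cpu, so a later pass can retry the same source CPU against that alternate. A cut-down model of that branch of can_migrate_task(); bitmasks stand in for cpumasks and the structure layout is simplified.

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_SOME_PINNED 0x04
#define NR_CPUS         8

struct lb_env {
    int dst_cpu;
    unsigned long dst_grpmask;   /* CPUs in the destination sched_group */
    int new_dst_cpu;
    unsigned int flags;
};

static int first_allowed(unsigned long group, unsigned long allowed)
{
    unsigned long both = group & allowed;

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (both & (1UL << cpu))
            return cpu;
    return NR_CPUS;              /* nothing in the group is allowed */
}

/* Returns 1 if the task may be pulled to env->dst_cpu. */
static int can_migrate_task(unsigned long task_allowed, struct lb_env *env)
{
    if (!(task_allowed & (1UL << env->dst_cpu))) {
        /* Remember one alternate CPU in our group, at most once per pass. */
        if (!(env->flags & LBF_SOME_PINNED)) {
            int cpu = first_allowed(env->dst_grpmask, task_allowed);
            if (cpu < NR_CPUS) {
                env->flags |= LBF_SOME_PINNED;
                env->new_dst_cpu = cpu;
            }
        }
        return 0;
    }
    env->flags &= ~LBF_ALL_PINNED;
    return 1;
}

int main(void)
{
    struct lb_env env = { .dst_cpu = 0, .dst_grpmask = 0x0f,
                          .flags = LBF_ALL_PINNED };

    /* Task pinned to CPUs 2-3: not movable to CPU 0, but CPU 2 is noted. */
    printf("%d new_dst=%d\n", can_migrate_task(0x0c, &env), env.new_dst_cpu);
    return 0;
}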
*/ static inline void update_sd_lb_stats(struct lb_env *env, - const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -3807,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, - cpus, balance, &sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); if (local_group && !(*balance)) return; @@ -4044,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. * @@ -4054,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, int *balance) { struct sd_lb_stats sds; @@ -4064,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, cpus, balance, &sds); + update_sd_lb_stats(env, balance, &sds); /* * this_cpu is not the appropriate cpu to perform load balancing at @@ -4144,8 +4154,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq *find_busiest_queue(struct lb_env *env, - struct sched_group *group, - const struct cpumask *cpus) + struct sched_group *group) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -4160,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, cpus)) + if (!cpumask_test_cpu(i, env->cpus)) continue; rq = cpu_rq(i); @@ -4227,7 +4236,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4237,16 +4247,19 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, + .cpus = cpus, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, cpus, balance); + group = find_busiest_group(&env, balance); if (*balance == 0) goto out_balanced; @@ -4256,7 +4269,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group, cpus); + busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -4267,6 +4280,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. 
If find_busiest_group has found @@ -4284,7 +4298,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4296,14 +4316,52 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604..b6baf37 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea..c35a1a7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. 
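The sched.h comment above explains why task_group() now returns a cached pointer that sched_move_task() refreshes while holding both p->pi_lock and rq->lock. A userspace sketch of that read-copy pattern, with pthread mutexes standing in for the kernel locks and everything else simplified.

#include <pthread.h>
#include <stdio.h>

struct task_group { const char *name; };

struct rq { pthread_mutex_t lock; };

struct task {
    pthread_mutex_t pi_lock;
    struct task_group *sched_task_group;   /* the cached copy */
};

static struct rq this_rq = { PTHREAD_MUTEX_INITIALIZER };

/* Fast path: no cgroup state consulted, no extra locking needed. */
static struct task_group *task_group(struct task *p)
{
    return p->sched_task_group;
}

/* Slow path: refresh the copy while holding both locks, the same locks any
 * scheduler path holds when it calls task_group() on a queued task. */
static void sched_move_task(struct task *p, struct task_group *tg)
{
    pthread_mutex_lock(&p->pi_lock);
    pthread_mutex_lock(&this_rq.lock);
    p->sched_task_group = tg;
    pthread_mutex_unlock(&this_rq.lock);
    pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
    struct task_group root = { "root" }, web = { "web" };
    struct task p;

    pthread_mutex_init(&p.pi_lock, NULL);
    p.sched_task_group = &root;

    sched_move_task(&p, &web);
    printf("now in group %s\n", task_group(&p)->name);
    return 0;
}

Readers never see a half-updated cgroup pointer because the only writer publishes the copy under the locks the readers already hold.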
+ * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -942,8 +939,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/signal.c b/kernel/signal.c index 6771027..be4f856 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } + if (unlikely(uprobe_deny_signal())) return 0; diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b2..29dd40a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - raw_spin_lock(&call_function.lock); -} - -void ipi_call_unlock(void) -{ - raw_spin_unlock(&call_function.lock); -} - -void ipi_call_lock_irq(void) -{ - raw_spin_lock_irq(&call_function.lock); -} - -void ipi_call_unlock_irq(void) -{ - raw_spin_unlock_irq(&call_function.lock); -} #endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acf..6ef9433 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -3,8 +3,6 @@ struct task_struct; -int smpboot_prepare(unsigned int cpu); - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD struct task_struct *idle_thread_get(unsigned int cpu); void idle_thread_set_boot_cpu(void); diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f959..b73e681 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + unsigned long old_flags = current->flags; + + /* + * Mask out PF_MEMALLOC s current task context is borrowed for the + * softirq. 
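The softirq change above saves the task flags, clears PF_MEMALLOC for the borrowed context, and afterwards restores only that bit. A small standalone model of the save/clear/restore helper; the PF_MEMALLOC value matches the kernel's definition, the rest is simplified.

#include <stdio.h>

#define PF_MEMALLOC 0x00000800   /* same bit value as the kernel flag */

struct task { unsigned long flags; };

/* Put the saved bits (under @mask) back without touching anything else. */
static void tsk_restore_flags(struct task *p, unsigned long saved, unsigned long mask)
{
    p->flags &= ~mask;
    p->flags |= saved & mask;
}

static void do_softirq(struct task *current)
{
    unsigned long old_flags = current->flags;

    current->flags &= ~PF_MEMALLOC;  /* borrowed context must not look like
                                        a memory-reclaim task */

    current->flags |= PF_MEMALLOC;   /* pretend a swap-backed socket in some
                                        handler set it again */

    tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}

int main(void)
{
    struct task t = { .flags = 0x1 };            /* PF_MEMALLOC clear */

    do_softirq(&t);
    printf("PF_MEMALLOC leaked: %s\n", (t.flags & PF_MEMALLOC) ? "yes" : "no");
    return 0;
}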
A softirq handled such as network RX might set PF_MEMALLOC + * again if the socket is related to swap + */ + current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); account_system_vtime(current); @@ -265,6 +273,7 @@ restart: account_system_vtime(current); __local_bh_enable(SOFTIRQ_OFFSET); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); } #ifndef __ARCH_HAS_DO_SOFTIRQ diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44d..241507f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) down_write(&mm->mmap_sem); /* - * Forbid mm->exe_file change if there are mapped other files. + * Forbid mm->exe_file change if old file still mapped. */ err = -EBUSY; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_file && !path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_unlock; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; } /* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) goto exit_unlock; + err = 0; set_mm_exe_file(mm, exe_file); exit_unlock: up_write(&mm->mmap_sem); @@ -2011,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } me->pdeath_signal = arg2; - error = 0; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); @@ -2025,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } set_dumpable(me->mm, arg2); - error = 0; break; case PR_SET_UNALIGN: @@ -2052,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; - else - error = 0; break; - case PR_SET_NAME: comm[sizeof(me->comm)-1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, @@ -2063,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); - return 0; + break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; - return 0; + break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; - case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; @@ -2104,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; - error = 0; break; case PR_MCE_KILL: if (arg4 | arg5) @@ -2127,13 +2124,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; default: return -EINVAL; } - error = 0; break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) @@ -2147,9 +2140,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case 
PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; - error = 0; break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, @@ -2191,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info) argv_free(info->argv); } -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) +static int __orderly_poweroff(void) { int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + char **argv; static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }; - int ret = -ENOMEM; + int ret; + argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", __func__, poweroff_cmd); - goto out; + return -ENOMEM; } ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, NULL, argv_cleanup, NULL); -out: - if (likely(!ret)) - return 0; - if (ret == -ENOMEM) argv_free(argv); - if (force) { + return ret; +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int ret = __orderly_poweroff(); + + if (ret && force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? 
+ */ emergency_sync(); kernel_power_off(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab1187..87174ef5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemcheck.h> +#include <linux/kmemleak.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = { .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = proc_dostring, + .proc_handler = proc_dostring_coredump, }, { .procname = "core_pipe_limit", @@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, }, { .procname = "swappiness", @@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = { #endif #endif { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_coredump, .extra1 = &zero, .extra2 = &two, }, @@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = { int __init sysctl_init(void) { - register_sysctl_table(sysctl_base_table); + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); return 0; } @@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +static void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMPABLE_SAFE && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING "Unsafe core_pattern used with "\ + "suid_dumpable=2. 
Pipe handler or fully qualified "\ + "core dump path required.\n"); + } +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694..65bdcf1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, /* VM_PAGEBUF unused */ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ diff --git a/kernel/task_work.c b/kernel/task_work.c index 82d1c79..91d4e17 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -3,82 +3,78 @@ #include <linux/tracehook.h> int -task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { + struct callback_head *last, *first; unsigned long flags; - int err = -ESRCH; -#ifndef TIF_NOTIFY_RESUME - if (notify) - return -ENOTSUPP; -#endif /* - * We must not insert the new work if the task has already passed - * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() - * and check PF_EXITING under pi_lock. + * Not inserting the new work if the task has already passed + * exit_task_work() is the responisbility of callers. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - if (likely(!(task->flags & PF_EXITING))) { - hlist_add_head(&twork->hlist, &task->task_works); - err = 0; - } + last = task->task_works; + first = last ? last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ - if (likely(!err) && notify) + if (notify) set_notify_resume(task); - return err; + return 0; } -struct task_work * +struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *twork; - struct hlist_node *pos; + struct callback_head *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); - hlist_for_each_entry(twork, pos, &task->task_works, hlist) { - if (twork->func == func) { - hlist_del(&twork->hlist); - goto found; + last = task->task_works; + if (last) { + struct callback_head *q = last, *p = q->next; + while (1) { + if (p->func == func) { + q->next = p->next; + if (p == last) + task->task_works = q == p ? 
NULL : q; + res = p; + break; + } + if (p == last) + break; + q = p; + p = q->next; } } - twork = NULL; - found: raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - return twork; + return res; } void task_work_run(void) { struct task_struct *task = current; - struct hlist_head task_works; - struct hlist_node *pos; + struct callback_head *p, *q; - raw_spin_lock_irq(&task->pi_lock); - hlist_move_list(&task->task_works, &task_works); - raw_spin_unlock_irq(&task->pi_lock); + while (1) { + raw_spin_lock_irq(&task->pi_lock); + p = task->task_works; + task->task_works = NULL; + raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(hlist_empty(&task_works))) - return; - /* - * We use hlist to save the space in task_struct, but we want fifo. - * Find the last entry, the list should be short, then process them - * in reverse order. - */ - for (pos = task_works.first; pos->next; pos = pos->next) - ; + if (unlikely(!p)) + return; - for (;;) { - struct hlist_node **pprev = pos->pprev; - struct task_work *twork = container_of(pos, struct task_work, - hlist); - twork->func(twork); - - if (pprev == &task_works.first) - break; - pos = container_of(pprev, struct hlist_node, next); + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; + } } } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e660464..d0a3279 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); + if (na == NULL) { + rc = -EMSGSIZE; + goto err; + } + stats = nla_data(na); memset(stats, 0, sizeof(*stats)); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154..46da053 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33ab..24174b4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6d..024540f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) /* * NO HZ enabled ? 
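The reworked task_work code above keeps the callbacks on a circular singly linked list whose head pointer references the last element, giving O(1) append while still running callbacks in FIFO order. A userspace model of the add and run halves; locking and the cancel path are omitted.

#include <stddef.h>
#include <stdio.h>

struct callback_head {
    struct callback_head *next;
    void (*func)(struct callback_head *);
};

static struct callback_head *task_works;      /* points at the tail */

static void work_add(struct callback_head *twork)
{
    struct callback_head *last = task_works;

    twork->next = last ? last->next : twork;  /* new tail points at the head */
    if (last)
        last->next = twork;
    task_works = twork;
}

static void work_run(void)
{
    struct callback_head *p = task_works, *q;

    task_works = NULL;
    if (!p)
        return;

    q = p->next;                              /* head of the list */
    p->next = NULL;                           /* break the circle */
    while (q) {
        p = q->next;                          /* q->func() may free q */
        q->func(q);
        q = p;
    }
}

static void say(struct callback_head *cb) { printf("ran %p\n", (void *)cb); }

int main(void)
{
    struct callback_head a = { .func = say }, b = { .func = say };

    work_add(&a);
    work_add(&b);
    work_run();                               /* runs a, then b */
    return 0;
}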
*/ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; /* * Enable / Disable tickless mode @@ -271,49 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; + ktime_t last_update, expires, ret = { .tv64 = 0 }; + unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - now = tick_nohz_start_idle(cpu, ts); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - if (need_resched()) - return; - - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -322,7 +288,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +296,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off @@ -392,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -401,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. 
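In the tick-stop path above, rcu_needs_cpu() now also reports a deadline (rcu_delta_jiffies) that caps how far the next wakeup may be pushed out. A sketch of that selection; both helpers here are made-up stand-ins for the real queries and return fixed illustrative values.

#include <stdbool.h>
#include <stdio.h>

static bool rcu_needs_cpu(int cpu, unsigned long *rcu_delta)
{
    (void)cpu;
    *rcu_delta = 4;            /* RCU wants this CPU back within 4 jiffies */
    return false;              /* ...but nothing has to run on the next tick */
}

static unsigned long get_next_timer_interrupt(unsigned long last)
{
    return last + 100;         /* next timer-wheel event is far away */
}

static unsigned long next_wakeup(int cpu, unsigned long last_jiffies)
{
    unsigned long rcu_delta, next_jiffies, delta_jiffies;

    if (rcu_needs_cpu(cpu, &rcu_delta)) {
        next_jiffies = last_jiffies + 1;        /* effectively keep the tick */
    } else {
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
        if (rcu_delta < delta_jiffies)          /* honour RCU's deadline */
            next_jiffies = last_jiffies + rcu_delta;
    }
    return next_jiffies;
}

int main(void)
{
    printf("wake at jiffy %lu\n", next_wakeup(0, 1000));
    return 0;
}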
@@ -442,6 +409,65 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now, expires; + int cpu = smp_processor_id(); + + now = tick_nohz_start_idle(cpu, ts); + + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** @@ -479,7 +505,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). */ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -499,7 +525,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -517,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ @@ -540,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + select_nohz_load_balancer(0); + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! 
+ */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -551,9 +612,6 @@ void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif ktime_t now; local_irq_disable(); @@ -568,39 +626,11 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (!ts->tick_stopped) { - local_irq_enable(); - return; + if (ts->tick_stopped) { + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); } - /* Update jiffies first */ - select_nohz_load_balancer(0); - tick_do_update_jiffies64(now); - update_cpu_load_nohz(); - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! - */ - ticks = jiffies - ts->idle_jiffies; - /* - * We might be one off. Do not randomly account a huge number of ticks! - */ - if (ticks && ticks < LONG_MAX) - account_idle_ticks(ticks); -#endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); - local_irq_enable(); } @@ -804,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00..e16af19 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,32 +24,32 @@ /* Structure holding internal timekeeping values. */ struct timekeeper { /* Current clocksource used for timekeeping. */ - struct clocksource *clock; + struct clocksource *clock; /* NTP adjusted clock multiplier */ - u32 mult; + u32 mult; /* The shift value of the current clocksource. */ - int shift; - + u32 shift; /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; + cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; + u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; + u32 raw_interval; + + /* Current CLOCK_REALTIME time in seconds */ + u64 xtime_sec; + /* Clock shifted nano seconds */ + u64 xtime_nsec; - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ - s64 ntp_error; + s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + u32 ntp_error_shift; - /* The current time */ - struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -64,14 +64,17 @@ struct timekeeper { * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. 
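The timekeeper rework above replaces the xtime timespec with xtime_sec plus a clocksource-shifted xtime_nsec, converting back to a timespec only at the boundaries. A standalone sketch of the normalize/convert helpers; the field names follow the patch, the surrounding types and the demo numbers are simplified inventions.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct timespec_s { long long tv_sec; long tv_nsec; };

struct timekeeper_s {
    uint32_t shift;          /* clocksource shift */
    uint64_t xtime_sec;      /* CLOCK_REALTIME seconds */
    uint64_t xtime_nsec;     /* nanoseconds, left-shifted by 'shift' */
};

/* Carry whole seconds out of the shifted-nanosecond field. */
static void tk_normalize_xtime(struct timekeeper_s *tk)
{
    while (tk->xtime_nsec >= (NSEC_PER_SEC << tk->shift)) {
        tk->xtime_nsec -= NSEC_PER_SEC << tk->shift;
        tk->xtime_sec++;
    }
}

static struct timespec_s tk_xtime(const struct timekeeper_s *tk)
{
    struct timespec_s ts = {
        .tv_sec  = (long long)tk->xtime_sec,
        .tv_nsec = (long)(tk->xtime_nsec >> tk->shift),
    };
    return ts;
}

int main(void)
{
    struct timekeeper_s tk = { .shift = 10, .xtime_sec = 100 };

    /* Accumulate 1.5 s worth of shifted nanoseconds, then normalize. */
    tk.xtime_nsec += (3 * NSEC_PER_SEC / 2) << tk.shift;
    tk_normalize_xtime(&tk);

    struct timespec_s ts = tk_xtime(&tk);
    printf("%lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
    return 0;
}

Keeping the sub-second part in the clocksource's shift domain avoids a truncating right shift on every accumulation step; the shift happens once, when a timespec is actually needed.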
*/ - struct timespec wall_to_monotonic; + struct timespec wall_to_monotonic; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; /* time spent in suspend */ - struct timespec total_sleep_time; + struct timespec total_sleep_time; + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - + struct timespec raw_time; /* Seqlock for all timekeeper values */ - seqlock_t lock; + seqlock_t lock; }; static struct timekeeper timekeeper; @@ -82,11 +85,62 @@ static struct timekeeper timekeeper; */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { + tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + tk->xtime_sec++; + } +} + +static struct timespec tk_xtime(struct timekeeper *tk) +{ + struct timespec ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + return ts; +} +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ + struct timespec tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec_to_ktime(tmp); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ + /* Verify consistency before modifying */ + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + + tk->total_sleep_time = t; + tk->offs_boot = timespec_to_ktime(t); +} /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -98,12 +152,14 @@ int __read_mostly timekeeping_suspended; * * Unless you're the timekeeping code, you should not be using this! 
*/ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; + struct clocksource *old_clock; - timekeeper.clock = clock; + old_clock = tk->clock; + tk->clock = clock; clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ @@ -116,74 +172,89 @@ static void timekeeper_setup_internals(struct clocksource *clock) tmp = 1; interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; + tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; - timekeeper.raw_interval = + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = ((u64) interval * clock->mult) >> clock->shift; - timekeeper.xtime_nsec = 0; - timekeeper.shift = clock->shift; + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + tk->xtime_nsec >>= -shift_change; + else + tk->xtime_nsec <<= shift_change; + } + tk->shift = clock->shift; - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - timekeeper.mult = clock->mult; + tk->mult = clock->mult; } /* Timekeeper helper functions. */ -static inline s64 timekeeping_get_ns(void) +static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec >>= tk->shift; + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* convert delta to nanoseconds. 
*/ + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } /* must hold write on timekeeper.lock */ -static void timekeeping_update(bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp) { + struct timespec xt; + if (clearntp) { - timekeeper.ntp_error = 0; + tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + xt = tk_xtime(tk); + update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } - /** * timekeeping_forward_now - update clock to the current time * @@ -191,27 +262,26 @@ static void timekeeping_update(bool clearntp) * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); + tk->xtime_nsec += arch_gettimeoffset() << tk->shift; - timespec_add_ns(&timekeeper.xtime, nsec); + tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&timekeeper.raw_time, nsec); + timespec_add_ns(&tk->raw_time, nsec); } /** @@ -222,21 +292,19 @@ static void timekeeping_forward_now(void) */ void getnstimeofday(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; - s64 nsecs; + s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - - *ts = timekeeper.xtime; - nsecs = timekeeping_get_ns(); + seq = read_seqbegin(&tk->lock); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } @@ -244,22 +312,18 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { + struct timekeeper *tk = &timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec + - timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeper.xtime.tv_nsec + - timekeeper.wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; + + } while (read_seqretry(&tk->lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -278,24 +342,22 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; - tomono = timekeeper.wall_to_monotonic; - nsecs = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -312,28 +374,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - u32 arch_offset; + seq = read_seqbegin(&tk->lock); - seq = read_seqbegin(&timekeeper.lock); + *ts_raw = tk->raw_time; + ts_real->tv_sec = tk->xtime_sec; + ts_real->tv_nsec = 0; - *ts_raw = timekeeper.raw_time; - *ts_real = timekeeper.xtime; + nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_real = timekeeping_get_ns(tk); - nsecs_raw = timekeeping_get_ns_raw(); - nsecs_real = timekeeping_get_ns(); - - /* If arch requires, add in gettimeoffset() */ - arch_offset = arch_gettimeoffset(); - nsecs_raw += arch_offset; - nsecs_real += arch_offset; - - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -366,25 +423,28 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timespec ts_delta; + struct timekeeper *tk = &timekeeper; + struct timespec ts_delta, xt; unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); + + timekeeping_forward_now(tk); - timekeeping_forward_now(); + xt = tk_xtime(tk); + ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, ts_delta); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); - timekeeper.xtime = *tv; - timekeeping_update(true); + tk_set_xtime(tk, tv); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, true); + + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -393,7 +453,6 @@ int do_settimeofday(const struct timespec *tv) } EXPORT_SYMBOL(do_settimeofday); - /** * timekeeping_inject_offset - Adds or subtracts from the current time. 
* @tv: pointer to the timespec variable containing the offset @@ -402,22 +461,23 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long flags; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(tk); - timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *ts); - timekeeping_update(true); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, true); + + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -433,23 +493,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ static int change_clocksource(void *data) { + struct timekeeper *tk = &timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - timekeeper_setup_internals(new); + old = tk->clock; + tk_setup_internals(tk, new); if (old->disable) old->disable(old); } - timekeeping_update(true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); return 0; } @@ -463,7 +524,9 @@ static int change_clocksource(void *data) */ void timekeeping_notify(struct clocksource *clock) { - if (timekeeper.clock == clock) + struct timekeeper *tk = &timekeeper; + + if (tk->clock == clock) return; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); @@ -492,35 +555,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs; do { - seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(); - *ts = timekeeper.raw_time; + seq = read_seqbegin(&tk->lock); + nsecs = timekeeping_get_ns_raw(tk); + *ts = tk->raw_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; int ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -530,14 +594,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; u64 ret; + do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->max_idle_ns; + ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -577,36 +643,38 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { + struct 
timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot; + struct timespec now, boot, tmp; read_persistent_clock(&now); read_boot_clock(&boot); - seqlock_init(&timekeeper.lock); + seqlock_init(&tk->lock); ntp_init(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - timekeeper_setup_internals(clock); - - timekeeper.xtime.tv_sec = now.tv_sec; - timekeeper.xtime.tv_nsec = now.tv_nsec; - timekeeper.raw_time.tv_sec = 0; - timekeeper.raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = timekeeper.xtime.tv_sec; - boot.tv_nsec = timekeeper.xtime.tv_nsec; - } - set_normalized_timespec(&timekeeper.wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - timekeeper.total_sleep_time.tv_sec = 0; - timekeeper.total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&timekeeper.lock, flags); + tk_setup_internals(tk, clock); + + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(tk); + + set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(tk, tmp); + + tmp.tv_sec = 0; + tmp.tv_nsec = 0; + tk_set_sleep_time(tk, tmp); + + write_sequnlock_irqrestore(&tk->lock, flags); } /* time in seconds when suspend began */ @@ -619,22 +687,19 @@ static struct timespec timekeeping_suspend_time; * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ -static void __timekeeping_inject_sleeptime(struct timespec *delta) +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec *delta) { if (!timespec_valid(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } - - timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); } - /** * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec delta value @@ -647,6 +712,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) */ void timekeeping_inject_sleeptime(struct timespec *delta) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -655,21 +721,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(delta); + __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); } - /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
* @@ -679,6 +744,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -686,17 +752,18 @@ static void timekeeping_resume(void) clocksource_resume(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&ts); + __timekeeping_inject_sleeptime(tk, &ts); } /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; + tk->clock->cycle_last = tk->clock->read(tk->clock); + tk->ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, false); + write_sequnlock_irqrestore(&tk->lock, flags); touch_softlockup_watchdog(); @@ -708,14 +775,15 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + write_seqlock_irqsave(&tk->lock, flags); + timekeeping_forward_now(tk); timekeeping_suspended = 1; /* @@ -724,7 +792,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -737,7 +805,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -763,7 +831,8 @@ device_initcall(timekeeping_init_ops); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, + s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -779,7 +848,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -788,8 +857,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; + tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); + tick_error -= tk->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. 
*/ @@ -814,9 +883,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 error, interval = timekeeper.cycle_interval; + s64 error, interval = tk->cycle_interval; int adj; /* @@ -832,7 +901,7 @@ static void timekeeping_adjust(s64 offset) * * Note: It does not "save" on aggravation when reading the code. */ - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); + error = tk->ntp_error >> (tk->ntp_error_shift - 1); if (error > interval) { /* * We now divide error by 4(via shift), which checks if @@ -847,34 +916,36 @@ static void timekeeping_adjust(s64 offset) * the error. This causes the likely below to be unlikely. * * The proper fix is to avoid rounding up by using - * the high precision timekeeper.xtime_nsec instead of + * the high precision tk->xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. */ if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else /* No adjustment needed */ - return; + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } else { + if (error < -interval) { + /* See comment above, this is just switched for the negative */ + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else { + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } + } else { + goto out_adjust; + } + } - if (unlikely(timekeeper.clock->maxadj && - (timekeeper.mult + adj > - timekeeper.clock->mult + timekeeper.clock->maxadj))) { + if (unlikely(tk->clock->maxadj && + (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - timekeeper.clock->name, (long)timekeeper.mult + adj, - (long)timekeeper.clock->mult + - timekeeper.clock->maxadj); + tk->clock->name, (long)tk->mult + adj, + (long)tk->clock->mult + tk->clock->maxadj); } /* * So the following can be confusing. @@ -925,13 +996,68 @@ static void timekeeping_adjust(s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; + tk->mult += adj; + tk->xtime_interval += interval; + tk->xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; + +out_adjust: + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. 
+ * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)tk->xtime_nsec < 0)) { + s64 neg = -(s64)tk->xtime_nsec; + tk->xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; + } + } +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + + while (tk->xtime_nsec >= nsecps) { + int leap; + + tk->xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + if (unlikely(leap)) { + struct timespec ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec_sub(tk->wall_to_monotonic, ts)); + + clock_was_set_delayed(); + } + } +} /** * logarithmic_accumulation - shifted accumulation of cycles @@ -942,49 +1068,40 @@ static void timekeeping_adjust(s64 offset) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; - /* If the offset is smaller than a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<<shift) + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < tk->cycle_interval<<shift) return offset; /* Accumulate one shifted interval */ - offset -= timekeeper.cycle_interval << shift; - timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; + offset -= tk->cycle_interval << shift; + tk->clock->cycle_last += tk->cycle_interval << shift; - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - int leap; - timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - } + tk->xtime_nsec += tk->xtime_interval << shift; + accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += timekeeper.raw_time.tv_nsec; + raw_nsecs = tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - timekeeper.raw_time.tv_sec += raw_secs; + tk->raw_time.tv_sec += raw_secs; } - timekeeper.raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += ntp_tick_length() << shift; - timekeeper.ntp_error -= - (timekeeper.xtime_interval + timekeeper.xtime_remainder) << - (timekeeper.ntp_error_shift + shift); + tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -992,25 +1109,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) static void update_wall_time(void) { struct clocksource *clock; + struct timekeeper *tk = &timekeeper; cycle_t offset; int shift = 0, maxshift; 
unsigned long flags; + s64 remainder; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = timekeeper.clock; + clock = tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = timekeeper.cycle_interval; + offset = tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1020,71 +1137,45 @@ static void update_wall_time(void) * chunk in one go, and then try to consume the next smaller * doubled multiple. */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); - if(offset < timekeeper.cycle_interval<<shift) + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift); + if (offset < tk->cycle_interval<<shift) shift--; } /* correct the clock when NTP error is too big */ - timekeeping_adjust(offset); - - /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in timekeeping_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. - */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; - } + timekeeping_adjust(tk, offset); /* - * Store full nanoseconds into xtime after rounding it up and - * add the remainder to the error difference. - */ - timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> - timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), this can be killed. 
+ */ + remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1 << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger than NSEC_PER_SEC + * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { - int leap; - timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - } + accumulate_nsecs_to_secs(tk); - timekeeping_update(false); + timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); } @@ -1101,18 +1192,18 @@ out: */ void getboottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec boottime = { - .tv_sec = timekeeper.wall_to_monotonic.tv_sec + - timekeeper.total_sleep_time.tv_sec, - .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + - timekeeper.total_sleep_time.tv_nsec + .tv_sec = tk->wall_to_monotonic.tv_sec + + tk->total_sleep_time.tv_sec, + .tv_nsec = tk->wall_to_monotonic.tv_nsec + + tk->total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } EXPORT_SYMBOL_GPL(getboottime); - /** * get_monotonic_boottime - Returns monotonic time since boot * @ts: pointer to the timespec to be set @@ -1124,23 +1215,23 @@ EXPORT_SYMBOL_GPL(getboottime); */ void get_monotonic_boottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; - tomono = timekeeper.wall_to_monotonic; - sleep = timekeeper.total_sleep_time; - nsecs = timekeeping_get_ns(); + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; + sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1167,31 +1258,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, timekeeper.total_sleep_time); + struct timekeeper *tk = &timekeeper; + + *ts = timespec_add(*ts, tk->total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime.tv_sec; + struct timekeeper *tk = &timekeeper; + + return tk->xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return timekeeper.xtime; + struct timekeeper *tk = &timekeeper; + + return tk_xtime(tk); } struct timespec current_kernel_time(void) { + struct timekeeper *tk = &timekeeper; struct timespec now; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = timekeeper.xtime; - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + } while (read_seqretry(&tk->lock, seq)); return now; } @@ -1199,15 +1297,16 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { + struct timekeeper *tk = &timekeeper; struct timespec now, mono; unsigned 
long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = timekeeper.xtime; - mono = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1236,34 +1335,67 @@ void do_timer(unsigned long ticks) void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, struct timespec *wtom, struct timespec *sleep) { + struct timekeeper *tk = &timekeeper; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); - *xtim = timekeeper.xtime; - *wtom = timekeeper.wall_to_monotonic; - *sleep = timekeeper.total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + *xtim = tk_xtime(tk); + *wtom = tk->wall_to_monotonic; + *sleep = tk->total_sleep_time; + } while (read_seqretry(&tk->lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + struct timekeeper *tk = &timekeeper; + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&tk->lock); + + secs = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + } while (read_seqretry(&tk->lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ ktime_t ktime_get_monotonic_offset(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; struct timespec wtom; do { - seq = read_seqbegin(&timekeeper.lock); - wtom = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + wtom = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. 
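The timekeeping.c changes above replace the single struct timespec xtime with a split representation: whole seconds in xtime_sec and clocksource-shifted nanoseconds in xtime_nsec, so the sub-nanosecond remainder of cycle_delta * mult is carried between updates instead of being rounded away on every tick. The following user-space sketch models only that bookkeeping; the struct, the 24 MHz clocksource, and the mult/shift values are invented for illustration and are not the kernel's.

/* Toy model of the xtime_sec + shifted xtime_nsec accounting. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct tk_model {
	uint64_t xtime_sec;	/* whole seconds of CLOCK_REALTIME */
	uint64_t xtime_nsec;	/* nanoseconds << shift, may hold >1s worth */
	uint32_t mult;		/* invented clocksource multiplier */
	uint32_t shift;		/* invented clocksource shift */
};

/* tk_normalize_xtime() analogue: carry whole seconds out of xtime_nsec */
static void tk_normalize(struct tk_model *tk)
{
	uint64_t nsecps = NSEC_PER_SEC << tk->shift;

	while (tk->xtime_nsec >= nsecps) {
		tk->xtime_nsec -= nsecps;
		tk->xtime_sec++;
	}
}

/* timekeeping_forward_now() analogue: accumulate an elapsed cycle count */
static void forward(struct tk_model *tk, uint64_t cycle_delta)
{
	tk->xtime_nsec += cycle_delta * (uint64_t)tk->mult;
	tk_normalize(tk);
}

/* tk_xtime() analogue: fold the shifted remainder into plain sec/nsec */
static void read_xtime(const struct tk_model *tk, uint64_t *sec, uint64_t *nsec)
{
	*sec = tk->xtime_sec;
	*nsec = tk->xtime_nsec >> tk->shift;
}

int main(void)
{
	/* Pretend clocksource: 24 MHz counter, shift 8, mult ~= 41.67 * 256 */
	struct tk_model tk = { 0, 0, 10667, 8 };
	uint64_t sec, nsec;

	forward(&tk, 48000000);		/* roughly two seconds of cycles */
	read_xtime(&tk, &sec, &nsec);
	printf("%llu.%09llu\n", (unsigned long long)sec,
	       (unsigned long long)nsec);
	return 0;
}

Keeping the remainder in shifted units is also what lets tk_setup_internals() preserve a leftover xtime_nsec across a clocksource change: it only shifts the stored value left or right by the difference in shifts, with no multiply or divide.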
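The reader-side hunks in this same file (getnstimeofday(), ktime_get(), ktime_get_update_offsets(), and friends) all share the seqlock retry loop: snapshot the sequence counter, copy the fields needed, and retry if a writer ran in between. The sketch below is a stripped-down, single-threaded illustration of that retry logic only; the names are invented, and the real read_seqbegin()/read_seqretry() additionally provide memory barriers while writers serialize on tk->lock with interrupts disabled.

/* Toy seqcount: writers make the counter odd while updating and even when
 * done; readers retry if the counter was odd or changed across their read. */
#include <stdio.h>

struct snapshot { unsigned seq; long sec; long nsec; };

static unsigned toy_read_begin(const struct snapshot *s)
{
	unsigned seq;

	do {
		seq = s->seq;
	} while (seq & 1);		/* writer in progress, wait */
	return seq;
}

static int toy_read_retry(const struct snapshot *s, unsigned seq)
{
	return s->seq != seq;		/* did a writer slip in? */
}

static void toy_write(struct snapshot *s, long sec, long nsec)
{
	s->seq++;			/* odd: update in progress */
	s->sec = sec;
	s->nsec = nsec;
	s->seq++;			/* even: update complete */
}

int main(void)
{
	struct snapshot s = { 0, 0, 0 };
	long sec, nsec;
	unsigned seq;

	toy_write(&s, 1345190550, 123456789);

	do {
		seq = toy_read_begin(&s);
		sec = s.sec;		/* copy a consistent pair... */
		nsec = s.nsec;		/* ...or loop and try again */
	} while (toy_read_retry(&s, seq));

	printf("%ld.%09ld\n", sec, nsec);
	return 0;
}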
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455..af5a7e9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e..a61c093 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -77,6 +77,7 @@ struct tvec_base { struct timer_list *running_timer; unsigned long timer_jiffies; unsigned long next_timer; + unsigned long active_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + /* + * Update base->active_timers and base->next_timer + */ + if (!tbase_get_deferrable(timer->base)) { + if (time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + base->active_timers++; + } +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) { @@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer, } EXPORT_SYMBOL(init_timer_deferrable_key); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer, entry->prev = LIST_POISON2; } +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + timer->base->active_timers--; +} + +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (!tbase_get_deferrable(timer->base)) { + timer->base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } + return 1; +} + /* * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are @@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } + 
ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; debug_activate(timer, expires); @@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer) timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } + ret = detach_if_pending(timer, base, true); spin_unlock_irqrestore(&base->lock, flags); } @@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); } -out: spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); } return index; @@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, 1); + detach_expired_timer(timer, base); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. 
*/ if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; + return expires; + spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; + base->active_timers = 0; return 0; } @@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); + /* We ignore the accounting on the dying cpu */ + detach_timer(timer, false); timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663..b4f20fb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8..49491fa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. 
*/ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6..5c38c81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ @@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. 
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); +release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } @@ -3172,10 +3187,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3195,6 +3210,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3609,6 +3625,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, @@ -3680,7 +3697,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -4231,6 +4248,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -4318,7 +4336,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220..55e1f7f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752..8a6d2ee 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a..a426f41 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct 
tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, +#endif { } /* Always set a last empty entry */ }; @@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c..ce27c8b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5..1a21170 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0..123b189 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc733..60e4d78 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -532,7 +532,7 @@ static void 
perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac6..03003cd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85..4b1dfba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -372,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128d..692d976 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,32 +45,41 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. 
+ * + * Note that DISASSOCIATED can be flipped only while holding + * managership of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ + GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* pool flags */ + POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ + NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -84,13 +93,13 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give -20. */ RESCUER_NICE_LEVEL = -20, + HIGHPRI_NICE_LEVEL = -20, }; /* @@ -115,6 +124,8 @@ enum { */ struct global_cwq; +struct worker_pool; +struct idle_rebind; /* * The poor guys doing the actual heavy lifting. 
All on-duty workers @@ -131,12 +142,31 @@ struct worker { struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct idle_rebind *idle_rebind; /* L: for idle worker */ + struct work_struct rebind_work; /* L: for busy worker */ +}; + +struct worker_pool { + struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int flags; /* X: flags */ + + struct list_head worklist; /* L: list of pending works */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + struct list_head idle_list; /* X: list of idle workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for workers */ + + struct mutex manager_mutex; /* mutex manager should hold */ + struct ida worker_ida; /* L: for worker IDs */ }; /* @@ -146,27 +176,16 @@ struct worker { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ unsigned int cpu; /* I: the associated cpu */ unsigned int flags; /* L: GCWQ_* flags */ - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ + /* workers are chained either in busy_hash or pool idle_list */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ + struct worker_pool pools[2]; /* normal and highpri pools */ - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ + wait_queue_head_t rebind_hold; /* rebind hold wait */ } ____cacheline_aligned_in_smp; /* @@ -175,7 +194,7 @@ struct global_cwq { * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#define for_each_worker_pool(pool, gcwq) \ + for ((pool) = &(gcwq)->pools[0]; \ + (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. 
*/ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { + [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +}; static int worker_thread(void *__worker); +static int worker_pool_pri(struct worker_pool *pool) +{ + return pool - pool->gcwq->pools; +} + static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } -static atomic_t *get_gcwq_nr_running(unsigned int cpu) +static atomic_t *get_pool_nr_running(struct worker_pool *pool) { + int cpu = pool->gcwq->cpu; + int idx = worker_pool_pri(pool); + if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); + return &per_cpu(pool_nr_running, cpu)[idx]; else - return &unbound_gcwq_nr_running; + return &unbound_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, @@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) } /* - * Policy functions. These define the policies on how the global - * worker pool is managed. Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. + * Policy functions. These define the policies on how the global worker + * pools are managed. Unless noted otherwise, these functions assume that + * they're being called with gcwq->lock held. */ -static bool __need_more_worker(struct global_cwq *gcwq) +static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; + return !atomic_read(get_pool_nr_running(pool)); } /* * Need to wake up a worker? Called from anything but currently * running workers. + * + * Note that, because unbound workers never contribute to nr_running, this + * function will always return %true for unbound gcwq as long as the + * worklist isn't empty. */ -static bool need_more_worker(struct global_cwq *gcwq) +static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); + return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) +static bool may_start_working(struct worker_pool *pool) { - return gcwq->nr_idle; + return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. 
*/ -static bool keep_working(struct global_cwq *gcwq) +static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&gcwq->worklist) && - (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); + return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) +static bool need_to_create_worker(struct worker_pool *pool) { - return need_more_worker(gcwq) && !may_start_working(gcwq); + return need_more_worker(pool) && !may_start_working(pool); } /* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) +static bool need_to_manage_workers(struct worker_pool *pool) { - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; + return need_to_create_worker(pool) || + (pool->flags & POOL_MANAGE_WORKERS); } /* Do we have too many workers and should some go away? */ -static bool too_many_workers(struct global_cwq *gcwq) +static bool too_many_workers(struct worker_pool *pool) { - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; + bool managing = mutex_is_locked(&pool->manager_mutex); + int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ + int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq) */ /* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) +static struct worker *first_worker(struct worker_pool *pool) { - if (unlikely(list_empty(&gcwq->idle_list))) + if (unlikely(list_empty(&pool->idle_list))) return NULL; - return list_first_entry(&gcwq->idle_list, struct worker, entry); + return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for + * @pool: worker pool to wake worker from * - * Wake up the first idle worker of @gcwq. + * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void wake_up_worker(struct global_cwq *gcwq) +static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(gcwq); + struct worker *worker = first_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_pool_nr_running(worker->pool)); } /** @@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); + struct worker_pool *pool = worker->pool; + atomic_t *nr_running = get_pool_nr_running(pool); if (worker->flags & WORKER_NOT_RUNNING) return NULL; @@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. 
This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) + to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); @@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (wakeup) { if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) - wake_up_worker(gcwq); + !list_empty(&pool->worklist)) + wake_up_worker(pool); } else atomic_dec(nr_running); } @@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); + atomic_inc(get_pool_nr_running(pool)); } /** @@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @gcwq, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. 
- */ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; - - list_for_each_entry(twork, &gcwq->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - gcwq->flags |= GCWQ_HIGHPRI_PENDING; - return &twork->entry; -} - -/** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to * @work: work to insert @@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); + if (__need_more_worker(pool)) + wake_up_worker(pool); } /* @@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); + worklist = &cwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); */ static void worker_enter_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker) /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; + pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); + list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. 
*/ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && - gcwq->nr_workers == gcwq->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && + pool->nr_workers == pool->nr_idle && + atomic_read(get_pool_nr_running(pool))); } /** @@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker) */ static void worker_leave_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; BUG_ON(!(worker->flags & WORKER_IDLE)); worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); } @@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock) } } +struct idle_rebind { + int cnt; /* # workers to be rebound */ + struct completion done; /* all workers rebound */ +}; + +/* + * Rebind an idle @worker to its CPU. During CPU onlining, this has to + * happen synchronously for idle workers. worker_thread() will test + * %WORKER_REBIND before leaving idle and call this function. + */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU must be online at this point */ + WARN_ON(!worker_maybe_bind_and_lock(worker)); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); + + /* we did our part, wait for rebind_workers() to finish up */ + wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); +} + /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Function for @worker->rebind.work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. 
*/ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_REBIND); @@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * The idle ones should be rebound synchronously and idle rebinding should + * be complete before any worker starts executing work items with + * concurrency management enabled; otherwise, scheduler may oops trying to + * wake up non-local idle worker from wq_worker_sleeping(). + * + * This is achieved by repeatedly requesting rebinding until all idle + * workers are known to have been rebound under @gcwq->lock and holding all + * idle workers from becoming busy until idle rebinding is complete. + * + * Once idle workers are rebound, busy workers can be rebound as they + * finish executing their current work items. Queueing the rebind work at + * the head of their scheduled lists is enough. Note that nr_running will + * be properbly bumped as busy workers rebind. + * + * On return, all workers are guaranteed to either be bound or have rebind + * work item scheduled. + */ +static void rebind_workers(struct global_cwq *gcwq) + __releases(&gcwq->lock) __acquires(&gcwq->lock) +{ + struct idle_rebind idle_rebind; + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->manager_mutex); + + /* + * Rebind idle workers. Interlocked both ways. We wait for + * workers to rebind via @idle_rebind.done. Workers will wait for + * us to finish up by watching %WORKER_REBIND. + */ + init_completion(&idle_rebind.done); +retry: + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + /* set REBIND and kick idle ones, we'll wait for these later */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + if (worker->flags & WORKER_REBIND) + continue; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + idle_rebind.cnt++; + worker->idle_rebind = &idle_rebind; + + /* worker_thread() will call idle_worker_rebind() */ + wake_up_process(worker->task); + } + } + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + /* busy ones might have become idle while waiting, retry */ + goto retry; + } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. 
+ */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + /* wq doesn't matter, use the default one */ + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to - * @bind: whether to set affinity to @cpu or not + * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @gcwq. The returned worker + * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using * destroy_worker(). * @@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct global_cwq *gcwq = pool->gcwq; + const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { + while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; spin_lock_irq(&gcwq->lock); } @@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) if (!worker) goto fail; - worker->gcwq = gcwq; + worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); + worker, cpu_to_node(gcwq->cpu), + "kworker/%u:%d%s", gcwq->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); + "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; + if (worker_pool_pri(pool)) + set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. + * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. 
*/ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; fail: if (id >= 0) { spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); spin_unlock_irq(&gcwq->lock); } kfree(worker); @@ -1424,7 +1555,7 @@ fail: static void start_worker(struct worker *worker) { worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; + worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); } @@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker) */ static void destroy_worker(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker) BUG_ON(!list_empty(&worker->scheduled)); if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; + pool->nr_workers--; if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker) kfree(worker); spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); } -static void idle_worker_timeout(unsigned long __gcwq) +static void idle_worker_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); - if (too_many_workers(gcwq)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); + pool->flags |= POOL_MANAGE_WORKERS; + wake_up_worker(pool); } } @@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; + cpu = cwq->pool->gcwq->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __gcwq) +static void gcwq_mayday_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. 
*/ - list_for_each_entry(work, &gcwq->worklist, entry) + list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for + * @pool: pool to create a new worker for * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve + * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be false and @@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_create_worker(struct global_cwq *gcwq) +static bool maybe_create_worker(struct worker_pool *pool) __releases(&gcwq->lock) __acquires(&gcwq->lock) { - if (!need_to_create_worker(gcwq)) + struct global_cwq *gcwq = pool->gcwq; + + if (!need_to_create_worker(pool)) return false; restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; - worker = create_worker(gcwq, true); + worker = create_worker(pool); if (worker) { - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); + BUG_ON(need_to_create_worker(pool)); return true; } - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CREATE_COOLDOWN); - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; } - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) + if (need_to_create_worker(pool)) goto restart; return true; } /** * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for + * @pool: pool to destroy workers for * - * Destroy @gcwq workers which have been idle for longer than + * Destroy @pool workers which have been idle for longer than * IDLE_WORKER_TIMEOUT. * * LOCKING: @@ -1610,19 +1746,19 @@ restart: * false if no action was taken and gcwq->lock stayed locked, true * otherwise. 
*/ -static bool maybe_destroy_workers(struct global_cwq *gcwq) +static bool maybe_destroy_workers(struct worker_pool *pool) { bool ret = false; - while (too_many_workers(gcwq)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); break; } @@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; bool ret = false; - if (gcwq->flags & GCWQ_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_mutex)) return ret; - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGE_WORKERS; /* * Destroy and then create so that may_start_working() is true * on return. */ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); - - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); + ret |= maybe_destroy_workers(pool); + ret |= maybe_create_worker(pool); + mutex_unlock(&pool->manager_mutex); return ret; } @@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); + move_linked_works(work, &cwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1804,7 +1930,8 @@ __releases(&gcwq->lock) __acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; @@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ + WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && + raw_smp_processor_id() != gcwq->cpu); + + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is * already processing the work. If so, defer the work to the @@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock) list_del_init(&work->entry); /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. 
- */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); - - if (!list_empty(&gcwq->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); - else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; - } - - /* * CPU intensive works don't participate in concurrency * management. They're the scheduler's responsibility. */ if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + /* + * Unbound gcwq isn't concurrency managed and work items should be + * executed ASAP. Wake up another worker if necessary. + */ + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + wake_up_worker(pool); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker) static int worker_thread(void *__worker) { struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* + * DIE can be set only while idle and REBIND set while busy has + * @worker->rebind_work scheduled. Checking here is enough. + */ + if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ - if (!need_more_worker(gcwq)) + if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* @@ -1979,7 +2117,7 @@ recheck: do { struct work_struct *work = - list_first_entry(&gcwq->worklist, + list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { @@ -1991,11 +2129,11 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } while (keep_working(gcwq)); + } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) goto recheck; /* @@ -2053,14 +2191,15 @@ repeat: for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; + rescuer->pool = pool; worker_maybe_bind_and_lock(rescuer); /* @@ -2068,7 +2207,7 @@ repeat: * process'em. 
*/ BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_cwq(work) == cwq) move_linked_works(work, scheduled, &n); @@ -2079,8 +2218,8 @@ repeat: * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); + if (keep_working(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; spin_lock_irq(&gcwq->lock); @@ -2421,9 +2560,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->gcwq->lock); + spin_lock_irq(&cwq->pool->gcwq->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); + spin_unlock_irq(&cwq->pool->gcwq->lock); if (drained) continue; @@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) + if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); @@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) flags |= WQ_RESCUER; - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. - */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); + int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; + cwq->pool = &gcwq->pools[pool_idx]; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; @@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. 
- * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) +/* claim manager positions of all pools */ +static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) { - struct global_cwq *gcwq = __gcwq; - struct worker *worker; - struct work_struct *work; - struct hlist_node *pos; - long rc; - int i; - - BUG_ON(gcwq->cpu != smp_processor_id()); + struct worker_pool *pool; + for_each_worker_pool(pool, gcwq) + mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); - BUG_ON(rc < 0); - - gcwq->flags |= GCWQ_MANAGING_WORKERS; - - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker->flags |= WORKER_ROGUE; +} - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; +/* release manager positions */ +static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +{ + struct worker_pool *pool; - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. 
- */ spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); + for_each_worker_pool(pool, gcwq) + mutex_unlock(&pool->manager_mutex); +} - /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. - */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); +static void gcwq_unbind_fn(struct work_struct *work) +{ + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; - spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); - spin_lock_irq(&gcwq->lock); + BUG_ON(gcwq->cpu != smp_processor_id()); - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); + gcwq_claim_management_and_lock(gcwq); /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. */ - while (gcwq->nr_workers != gcwq->nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; - - list_for_each_entry(work, &gcwq->worklist, entry) { - send_mayday(work); - nr_works++; - } + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_UNBOUND; - list_for_each_entry(worker, &gcwq->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_UNBOUND; - if (need_to_create_worker(gcwq)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); - } - } + gcwq->flags |= GCWQ_DISASSOCIATED; - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } + gcwq_release_management_and_unlock(gcwq); /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, - struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); + schedule(); /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. 
There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. */ - WARN_ON(!list_empty(&gcwq->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; + for_each_worker_pool(pool, gcwq) + atomic_set(get_pool_nr_running(pool), 0); } -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. 
*/ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } -} - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; + struct worker_pool *pool; - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; - } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + for_each_worker_pool(pool, gcwq) { + struct worker *worker; - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; + if (pool->nr_workers) + continue; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } break; case CPU_DOWN_FAILED: case CPU_ONLINE: + gcwq_claim_management_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + rebind_workers(gcwq); + gcwq_release_management_and_unlock(gcwq); break; } + return NOTIFY_OK; +} - spin_unlock_irqrestore(&gcwq->lock, flags); +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. 
+ */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; - return notifier_from_errno(0); + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); + break; + } + return NOTIFY_OK; } #ifdef CONFIG_SMP @@ -3746,6 +3653,7 @@ void thaw_workqueues(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3767,7 +3675,8 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(gcwq); + for_each_worker_pool(pool, gcwq) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3783,46 +3692,57 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; + for_each_worker_pool(pool, gcwq) { + pool->gcwq = gcwq; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); + setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + (unsigned long)pool); - ida_init(&gcwq->worker_ida); + mutex_init(&pool->manager_mutex); + ida_init(&pool->worker_ida); + } - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); + init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; + struct worker_pool *pool; if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) { + struct worker *worker; + + worker = create_worker(pool); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } } system_wq = alloc_workqueue("events", 0, 0); |
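For readers following the workqueue portion of this diff: the patch replaces the single per-gcwq worklist with an array of two worker pools (normal and highpri) and derives a pool's priority from its position inside that array rather than storing an explicit index. Below is a minimal, standalone user-space sketch of that idea; the structure layouts are heavily simplified, the names are borrowed from the diff for readability, and this is an illustration only, not the kernel implementation.

/*
 * Standalone sketch (not kernel code): models how a gcwq owns an array of
 * NR_WORKER_POOLS pools and how a pool's priority index is recovered by
 * pointer arithmetic, in the spirit of worker_pool_pri() and
 * for_each_worker_pool() in the diff above. Simplified structures.
 */
#include <stdio.h>

#define NR_WORKER_POOLS 2	/* normal and highpri, as in the patch */

struct global_cwq;

struct worker_pool {
	struct global_cwq *gcwq;	/* back-pointer to the owning gcwq */
	int nr_workers;			/* simplified bookkeeping */
};

struct global_cwq {
	unsigned int cpu;
	struct worker_pool pools[NR_WORKER_POOLS];	/* [0]=normal, [1]=highpri */
};

/* pool index doubles as its priority: 0 = normal pool, 1 = highpri pool */
static int worker_pool_pri(struct worker_pool *pool)
{
	return pool - pool->gcwq->pools;
}

#define for_each_worker_pool(pool, gcwq)				\
	for ((pool) = &(gcwq)->pools[0];				\
	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)

int main(void)
{
	struct global_cwq gcwq = { .cpu = 0 };
	struct worker_pool *pool;

	/* each pool keeps a back-pointer to its gcwq, as the init path sets up */
	for_each_worker_pool(pool, &gcwq)
		pool->gcwq = &gcwq;

	for_each_worker_pool(pool, &gcwq)
		printf("cpu %u pool %d (%s)\n", gcwq.cpu, worker_pool_pri(pool),
		       worker_pool_pri(pool) ? "highpri" : "normal");
	return 0;
}

Deriving the priority from the pool's offset within the embedded array keeps struct worker_pool free of a redundant index field and lets per-CPU counters such as pool_nr_running be indexed with the same value, which is the pattern the diff applies in get_pool_nr_running().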