diff options
Diffstat (limited to 'kernel')
67 files changed, 3106 insertions, 1613 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index f2ab700..ab4f109 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -3,3 +3,4 @@ # config_data.h config_data.gz +timeconst.h diff --git a/kernel/Makefile b/kernel/Makefile index 135a1b9..6c584c5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,12 +4,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ + sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o \ - utsname.o notifier.o ksysfs.o pm_qos_params.o + notifier.o ksysfs.o pm_qos_params.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -42,7 +42,11 @@ obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o +obj-$(CONFIG_UTS_NS) += utsname.o +obj-$(CONFIG_USER_NS) += user_namespace.o +obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -87,3 +91,11 @@ quiet_cmd_ikconfiggz = IKCFG $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_timeconst = TIMEC $@ + cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ +targets += timeconst.h +$(obj)/timeconst.h: $(src)/timeconst.pl FORCE + $(call if_changed,timeconst) diff --git a/kernel/audit.c b/kernel/audit.c index c8555b1..2eeea9a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct dentry *dentry, struct vfsmount *vfsmnt) + struct path *path) { - char *p, *path; + char *p, *pathname; if (prefix) audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ - path = kmalloc(PATH_MAX+11, ab->gfp_mask); - if (!path) { + pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); + if (!pathname) { audit_log_format(ab, "<no memory>"); return; } - p = d_path(dentry, vfsmnt, path, PATH_MAX+11); + p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "<too long>"); } else audit_log_untrustedstring(ab, p); - kfree(path); + kfree(pathname); } /** diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f4fcf58..9ef5e0a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -549,8 +549,8 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!root_mnt) goto skip_it; @@ -583,17 +583,17 @@ skip_it: static int is_under(struct vfsmount *mnt, struct dentry *dentry, struct nameidata *nd) { - if (mnt != nd->mnt) { + if (mnt != nd->path.mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->mnt) + if (mnt->mnt_parent == nd->path.mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->dentry); + return is_subdir(dentry, nd->path.dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule) err = path_lookup(tree->pathname, 0, &nd); if (err) goto Err; - mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!mnt) { err = -ENOMEM; goto Err; @@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new) err = path_lookup(new, 0, &nd); if (err) return err; - tagged = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + tagged = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!tagged) return -ENOMEM; @@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new) drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.mnt); - dentry = dget(nd.dentry); - path_release(&nd); + mnt = mntget(nd.path.mnt); + dentry = dget(nd.path.dentry); + path_put(&nd.path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new) spin_lock(&vfsmount_lock); if (!is_under(mnt, dentry, &nd)) { spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6f19fd4..2f2914b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) inotify_init_watch(&parent->wdata); /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, - AUDIT_IN_WATCH); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); return ERR_PTR(wd); @@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp, static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { - path_release(ndp); + path_put(&ndp->path); kfree(ndp); } if (ndw) { - path_release(ndw); + path_put(&ndw->path); kfree(ndw); } } @@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* update watch filter fields */ if (ndw) { - watch->dev = ndw->dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->dentry->d_inode->i_ino; + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; } /* The audit_filter_mutex must not be held during inotify calls because @@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, */ mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c06ecf..ac6d9b2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -208,8 +208,7 @@ struct audit_context { int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ - struct dentry * pwd; - struct vfsmount * pwdmnt; + struct path pwd; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; @@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context) __putname(context->names[i].name); } context->name_count = 0; - if (context->pwd) - dput(context->pwd); - if (context->pwdmnt) - mntput(context->pwdmnt); - context->pwd = NULL; - context->pwdmnt = NULL; + path_put(&context->pwd); + context->pwd.dentry = NULL; + context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_path.dentry, - vma->vm_file->f_path.mnt); + &vma->vm_file->f_path); break; } vma = vma->vm_next; @@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->target_sid, context->target_comm)) call_panic = 1; - if (context->pwd && context->pwdmnt) { + if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } @@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", context->pwd, - context->pwdmnt); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1695,10 +1689,10 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd) { + if (!context->pwd.dentry) { read_lock(¤t->fs->lock); - context->pwd = dget(current->fs->pwd); - context->pwdmnt = mntget(current->fs->pwdmnt); + context->pwd = current->fs->pwd; + path_get(¤t->fs->pwd); read_unlock(¤t->fs->lock); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a3c239..4766bb6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -141,7 +141,7 @@ enum { ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ }; -inline int cgroup_is_releasable(const struct cgroup *cgrp) +static int cgroup_is_releasable(const struct cgroup *cgrp) { const int bits = (1 << CGRP_RELEASABLE) | @@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp) return (cgrp->flags & bits) == bits; } -inline int notify_on_release(const struct cgroup *cgrp) +static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } @@ -489,7 +489,7 @@ static struct css_set *find_css_set( * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only - * attach_task() can increment it again. Because a count of zero + * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely @@ -520,17 +520,17 @@ static struct css_set *find_css_set( * The task_lock() exception * * The need for this exception arises from the action of - * attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one tasks cgroup pointer with * another. It does so using cgroup_mutexe, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cgroup pointer by attach_task() + * update of a tasks cgroup pointer by cgroup_attach_task() */ /** @@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) return inode; } +/* + * Call subsys's pre_destroy handler. + * This is called before css refcnt check. + */ + +static void cgroup_call_pre_destroy(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + ss->pre_destroy(ss, cgrp); + return; +} + + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; + struct cgroup_subsys *ss; BUG_ON(!(cgroup_is_removed(cgrp))); /* It's possible for external users to be holding css * reference counts on a cgroup; css_put() needs to @@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * queue the cgroup to be handled by the release * agent */ synchronize_rcu(); + + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) { + if (cgrp->subsys[ss->subsys_id]) + ss->destroy(ss, cgrp); + } + + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); + + /* Drop the active superblock reference that we took when we + * created the cgroup */ + deactivate_super(cgrp->root->sb); + kfree(cgrp); } iput(inode); @@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp, * Call holding cgroup_mutex. May take task_lock of * the task 'pid' during call. */ -static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; struct cgroup_subsys *ss; @@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk); - if (retval) { + if (retval) return retval; - } } } @@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); - if (!newcg) { + if (!newcg) return -ENOMEM; - } task_lock(tsk); if (tsk->flags & PF_EXITING) { @@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) write_unlock(&css_set_lock); for_each_subsys(root, ss) { - if (ss->attach) { + if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); - } } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) if (pid) { rcu_read_lock(); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk || tsk->flags & PF_EXITING) { rcu_read_unlock(); return -ESRCH; @@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) get_task_struct(tsk); } - ret = attach_task(cgrp, tsk); + ret = cgroup_attach_task(cgrp, tsk); put_task_struct(tsk); return ret; } @@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, goto out1; } buffer[nbytes] = 0; /* nul-terminate */ + strstrip(buffer); /* strip -just- trailing whitespace */ mutex_lock(&cgroup_mutex); + /* + * This was already checked for in cgroup_file_write(), but + * check again now we're holding cgroup_mutex. + */ if (cgroup_is_removed(cgrp)) { retval = -ENODEV; goto out2; @@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); break; case FILE_RELEASE_AGENT: - { - struct cgroupfs_root *root = cgrp->root; - /* Strip trailing newline */ - if (nbytes && (buffer[nbytes-1] == '\n')) { - buffer[nbytes-1] = 0; - } - if (nbytes < sizeof(root->release_agent_path)) { - /* We never write anything other than '\0' - * into the last char of release_agent_path, - * so it always remains a NUL-terminated - * string */ - strncpy(root->release_agent_path, buffer, nbytes); - root->release_agent_path[nbytes] = 0; - } else { - retval = -ENOSPC; - } + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + strcpy(cgrp->root->release_agent_path, buffer); break; - } default: retval = -EINVAL; goto out2; @@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft) + if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft) + if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp, it->task = cg->tasks.next; } +/* + * To reduce the fork() overhead for systems that are not actually + * using their cgroups capability, we don't maintain the lists running + * through each css_set to its tasks until we see the list actually + * used - in other words after the first call to cgroup_iter_start(). + * + * The tasklist_lock is not held here, as do_each_thread() and + * while_each_thread() are protected by RCU. + */ +void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + write_lock(&css_set_lock); + use_task_css_set_links = 1; + do_each_thread(g, p) { + task_lock(p); + if (list_empty(&p->cg_list)) + list_add(&p->cg_list, &p->cgroups->tasks); + task_unlock(p); + } while_each_thread(g, p); + write_unlock(&css_set_lock); +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) { /* @@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) * we need to enable the list linking each css_set to its * tasks, and fix up all existing tasks. */ - if (!use_task_css_set_links) { - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - do_each_thread(g, p) { - task_lock(p); - if (list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); - task_unlock(p); - } while_each_thread(g, p); - write_unlock(&css_set_lock); - } + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + read_lock(&css_set_lock); it->cg_link = &cgrp->css_sets; cgroup_advance_iter(cgrp, it); @@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) read_unlock(&css_set_lock); } +static inline int started_after_time(struct task_struct *t1, + struct timespec *time, + struct task_struct *t2) +{ + int start_diff = timespec_compare(&t1->start_time, time); + if (start_diff > 0) { + return 1; + } else if (start_diff < 0) { + return 0; + } else { + /* + * Arbitrarily, if two processes started at the same + * time, we'll say that the lower pointer value + * started first. Note that t2 may have exited by now + * so this may not be a valid pointer any longer, but + * that's fine - it still serves to distinguish + * between two tasks started (effectively) simultaneously. + */ + return t1 > t2; + } +} + +/* + * This function is a callback from heap_insert() and is used to order + * the heap. + * In this case we order the heap in descending task start time. + */ +static inline int started_after(void *p1, void *p2) +{ + struct task_struct *t1 = p1; + struct task_struct *t2 = p2; + return started_after_time(t1, &t2->start_time, t2); +} + +/** + * cgroup_scan_tasks - iterate though all the tasks in a cgroup + * @scan: struct cgroup_scanner containing arguments for the scan + * + * Arguments include pointers to callback functions test_task() and + * process_task(). + * Iterate through all the tasks in a cgroup, calling test_task() for each, + * and if it returns true, call process_task() for it also. + * The test_task pointer may be NULL, meaning always true (select all tasks). + * Effectively duplicates cgroup_iter_{start,next,end}() + * but does not lock css_set_lock for the call to process_task(). + * The struct cgroup_scanner may be embedded in any structure of the caller's + * creation. + * It is guaranteed that process_task() will act on every task that + * is a member of the cgroup for the duration of this call. This + * function may or may not call process_task() for tasks that exit + * or move to a different cgroup during the call, or are forked or + * move into the cgroup during the call. + * + * Note that test_task() may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should + * be cheap. + * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been + * pre-allocated and will be used for heap operations (and its "gt" member will + * be overwritten), else a temporary heap will be used (allocation of which + * may cause this function to fail). + */ +int cgroup_scan_tasks(struct cgroup_scanner *scan) +{ + int retval, i; + struct cgroup_iter it; + struct task_struct *p, *dropped; + /* Never dereference latest_task, since it's not refcounted */ + struct task_struct *latest_task = NULL; + struct ptr_heap tmp_heap; + struct ptr_heap *heap; + struct timespec latest_time = { 0, 0 }; + + if (scan->heap) { + /* The caller supplied our heap and pre-allocated its memory */ + heap = scan->heap; + heap->gt = &started_after; + } else { + /* We need to allocate our own heap memory */ + heap = &tmp_heap; + retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); + if (retval) + /* cannot allocate the heap */ + return retval; + } + + again: + /* + * Scan tasks in the cgroup, using the scanner's "test_task" callback + * to determine which are of interest, and using the scanner's + * "process_task" callback to process any of them that need an update. + * Since we don't want to hold any locks during the task updates, + * gather tasks to be processed in a heap structure. + * The heap is sorted by descending task start time. + * If the statically-sized heap fills up, we overflow tasks that + * started later, and in future iterations only consider tasks that + * started after the latest task in the previous pass. This + * guarantees forward progress and that we don't miss any tasks. + */ + heap->size = 0; + cgroup_iter_start(scan->cg, &it); + while ((p = cgroup_iter_next(scan->cg, &it))) { + /* + * Only affect tasks that qualify per the caller's callback, + * if he provided one + */ + if (scan->test_task && !scan->test_task(p, scan)) + continue; + /* + * Only process tasks that started after the last task + * we processed + */ + if (!started_after_time(p, &latest_time, latest_task)) + continue; + dropped = heap_insert(heap, p); + if (dropped == NULL) { + /* + * The new task was inserted; the heap wasn't + * previously full + */ + get_task_struct(p); + } else if (dropped != p) { + /* + * The new task was inserted, and pushed out a + * different task + */ + get_task_struct(p); + put_task_struct(dropped); + } + /* + * Else the new task was newer than anything already in + * the heap and wasn't inserted + */ + } + cgroup_iter_end(scan->cg, &it); + + if (heap->size) { + for (i = 0; i < heap->size; i++) { + struct task_struct *p = heap->ptrs[i]; + if (i == 0) { + latest_time = p->start_time; + latest_task = p; + } + /* Process the task per the caller's callback */ + scan->process_task(p, scan); + put_task_struct(p); + } + /* + * If we had to process any tasks at all, scan again + * in case some of them were in the middle of forking + * children that didn't get processed. + * Not the most efficient way to do it, but it avoids + * having to take callback_mutex in the fork path + */ + goto again; + } + if (heap == &tmp_heap) + heap_free(&tmp_heap); + return 0; +} + /* * Stuff for reading the 'tasks' file. * @@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_nr(tsk); + pidarray[n++] = task_pid_vnr(tsk); } cgroup_iter_end(cgrp, &it); return n; @@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) { + if (css && atomic_read(&css->refcnt)) return 1; - } } return 0; } @@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct cgroup_subsys *ss; struct super_block *sb; struct cgroupfs_root *root; @@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* + * Call pre_destroy handlers of subsys + */ + cgroup_call_pre_destroy(cgrp); + /* + * Notify subsyses that rmdir() request comes. + */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } - for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } - spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) @@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) cgroup_d_remove_dir(d); dput(d); - root->number_of_cgroups--; set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ - deactivate_super(sb); return 0; } @@ -2324,7 +2514,7 @@ out: * - Used for /proc/<pid>/cgroup. * - No need to task_lock(tsk) on this tsk->cgroup reference, as it * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping attach_task() from changing it + * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it * anyway. No need to check that tsk->cgroup != NULL, thanks to * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks * cgroup to top_cgroup. @@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = { * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. attach_task() might + * might no longer be a valid cgroup pointer. cgroup_attach_task() might * have already changed current->cgroups, allowing the previously * referenced cgroup group to be removed and freed. * @@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child) * attach us to a different cgroup, decrementing the count on * the first cgroup that we never incremented. But in this case, * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any attach_task() attempts, or task is a failed - * fork, never visible to attach_task. + * which wards off any cgroup_attach_task() attempts, or task is a failed + * fork, never visible to cgroup_attach_task. * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) @@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) } /* All seems fine. Finish by moving the task into the new cgroup */ - ret = attach_task(child, tsk); + ret = cgroup_attach_task(child, tsk); mutex_unlock(&cgroup_mutex); out_release: diff --git a/kernel/compat.c b/kernel/compat.c index 42a1ed4..5f0e201 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -40,10 +40,35 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +static long compat_nanosleep_restart(struct restart_block *restart) +{ + struct compat_timespec __user *rmtp; + struct timespec rmt; + mm_segment_t oldfs; + long ret; + + rmtp = (struct compat_timespec __user *)(restart->arg1); + restart->arg1 = (unsigned long)&rmt; + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep_restart(restart); + set_fs(oldfs); + + if (ret) { + restart->arg1 = (unsigned long)rmtp; + + if (rmtp && put_compat_timespec(&rmt, rmtp)) + return -EFAULT; + } + + return ret; +} + asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) { struct timespec tu, rmt; + mm_segment_t oldfs; long ret; if (get_compat_timespec(&tu, rqtp)) @@ -52,11 +77,21 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep(&tu, + rmtp ? (struct timespec __user *)&rmt : NULL, + HRTIMER_MODE_REL, CLOCK_MONOTONIC); + set_fs(oldfs); + + if (ret) { + struct restart_block *restart + = ¤t_thread_info()->restart_block; + + restart->fn = compat_nanosleep_restart; + restart->arg1 = (unsigned long)rmtp; - if (ret && rmtp) { - if (put_compat_timespec(&rmt, rmtp)) + if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index e0d3a4f..2eff3f6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -389,7 +389,7 @@ int disable_nonboot_cpus(void) return error; } -void enable_nonboot_cpus(void) +void __ref enable_nonboot_cpus(void) { int cpu, error; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cfaf641..3e296ed 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -38,7 +38,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> -#include <linux/prio_heap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -56,6 +55,8 @@ #include <asm/atomic.h> #include <linux/mutex.h> #include <linux/kfifo.h> +#include <linux/workqueue.h> +#include <linux/cgroup.h> /* * Tracks how many cpusets are currently defined in system. @@ -64,7 +65,7 @@ */ int number_of_cpusets __read_mostly; -/* Retrieve the cpuset from a cgroup */ +/* Forward declare cgroup structures */ struct cgroup_subsys cpuset_subsys; struct cpuset; @@ -96,6 +97,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + + /* used for walking a cpuset heirarchy */ + struct list_head stack_list; }; /* Retrieve the cpuset for a cgroup */ @@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task) return container_of(task_subsys_state(task, cpuset_subsys_id), struct cpuset, css); } - +struct cpuset_hotplug_scanner { + struct cgroup_scanner scan; + struct cgroup *to; +}; /* bits in struct cpuset flags field */ typedef enum { @@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs) * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach_task() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. * * Generations are needed for mems_allowed because one task cannot - * modify anothers memory placement. So we must enable every task, + * modify another's memory placement. So we must enable every task, * on every visit to __alloc_pages(), to efficiently check whether * its current->cpuset->mems_allowed has changed, requiring an update * of its current->mems_allowed. * - * Since cpuset_mems_generation is guarded by manage_mutex, + * Since writes to cpuset_mems_generation are guarded by the cgroup lock * there is no need to mark it atomic. */ static int cpuset_mems_generation; @@ -182,17 +189,20 @@ static struct cpuset top_cpuset = { }; /* - * We have two global cpuset mutexes below. They can nest. - * It is ok to first take manage_mutex, then nest callback_mutex. We also - * require taking task_lock() when dereferencing a tasks cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global mutexes guarding cpuset structures. The first + * is the main control groups cgroup_mutex, accessed via + * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific + * callback_mutex, below. They can nest. It is ok to first take + * cgroup_mutex, then nest callback_mutex. We also require taking + * task_lock() when dereferencing a task's cpuset pointer. See "The + * task_lock() exception", at the end of this comment. * * A task must hold both mutexes to modify cpusets. If a task - * holds manage_mutex, then it blocks others wanting that mutex, + * holds cgroup_mutex, then it blocks others wanting that mutex, * ensuring that it is the only task able to also acquire callback_mutex * and be able to modify cpusets. It can perform various checks on * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding manage_mutex. While it is + * also allocate memory while just holding cgroup_mutex. While it is * performing these checks, various callback routines can briefly * acquire callback_mutex to query cpusets. Once it is ready to make * the changes, it takes callback_mutex, blocking everyone else. @@ -208,60 +218,16 @@ static struct cpuset top_cpuset = { * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. * - * Any task can increment and decrement the count field without lock. - * So in general, code holding manage_mutex or callback_mutex can't rely - * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can - * increment it again. Because a count of zero means that no tasks - * are currently attached, therefore there is no way a task attached - * to that cpuset can fork (the other way to increment the count). - * So code holding manage_mutex or callback_mutex can safely assume that - * if the count is zero, it will stay zero. Similarly, if a task - * holds manage_mutex or callback_mutex on a cpuset with zero count, it - * knows that the cpuset won't be removed, as cpuset_rmdir() needs - * both of those mutexes. - * * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds manage_mutex across the entire operation, + * the cpuset hierarchy holds cgroup_mutex across the entire operation, * single threading all such cpuset modifications across the system. * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * - * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't - * (usually) take either mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cpuset_exit(), - * when a task in a notify_on_release cpuset exits. Then manage_mutex - * is taken, and if the cpuset count is zero, a usermode call made - * to /sbin/cpuset_release_agent with the name of the cpuset (path - * relative to the root of cpuset file system) as the argument. - * - * A cpuset can only be deleted if both its 'count' of using tasks - * is zero, and its list of 'children' cpusets is empty. Since all - * tasks in the system use _some_ cpuset, and since there is always at - * least one task in the system (init), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So we don't - * need a special hack to ensure that top_cpuset cannot be deleted. - * - * The above "Tale of Two Semaphores" would be complete, but for: - * - * The task_lock() exception - * - * The need for this exception arises from the action of attach_task(), - * which overwrites one tasks cpuset pointer with another. It does - * so using both mutexes, however there are several performance - * critical places that need to reference task->cpuset without the - * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying - * a tasks cpuset pointer we use task_lock(), which acts on a spinlock - * (task->alloc_lock) already in the task_struct routinely used for - * such matters. - * - * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the - * access of task->cpuset->mems_generation via that pointer in - * the routine cpuset_update_task_memory_state(). + * Accessing a task's cpuset should be done in accordance with the + * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(callback_mutex); @@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Do not call this routine if in_interrupt(). * * Call without callback_mutex or task_lock() held. May be - * called with or without manage_mutex held. Thanks in part to - * 'the_top_cpuset_hack', the tasks cpuset pointer will never + * called with or without cgroup_mutex held. Thanks in part to + * 'the_top_cpuset_hack', the task's cpuset pointer will never * be NULL. This routine also might acquire callback_mutex and * current->mm->mmap_sem during call. * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset by attach_task(), - * using RCU. + * from concurrent freeing of current->cpuset using RCU. * * The rcu_dereference() is technically probably not needed, * as I don't actually mind if I see a new cpuset pointer but @@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding manage_mutex. + * are only set if the other's are set. Call holding cgroup_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * manage_mutex held. + * cgroup_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) if (!is_cpuset_subset(trial, par)) return -EACCES; - /* If either I or some sibling (!= me) is exclusive, we can't overlap */ + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ list_for_each_entry(cont, &par->css.cgroup->children, sibling) { c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && @@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2) return started_after_time(t1, &t2->start_time, t2); } -/* - * Call with manage_mutex held. May take callback_mutex during call. +/** + * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Called for each task in a cgroup by cgroup_scan_tasks(). + * Return nonzero if this tasks's cpus_allowed mask should be changed (in other + * words, if its mask is not equal to its cpuset's mask). + */ +int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + return !cpus_equal(tsk->cpus_allowed, + (cgroup_cs(scan->cg))->cpus_allowed); +} + +/** + * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner containing the cgroup of the task + * + * Called by cgroup_scan_tasks() for each task in a cgroup whose + * cpus_allowed mask needs to be changed. + * + * We don't need to re-check for the cgroup/cpuset membership, since we're + * holding cgroup_lock() at this point. */ +void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); +} +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @buf: buffer of cpu numbers written to this cpuset + */ static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval, i; - int is_load_balanced; - struct cgroup_iter it; - struct cgroup *cgrp = cs->css.cgroup; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; + struct cgroup_scanner scan; struct ptr_heap heap; - struct timespec latest_time = { 0, 0 }; + int retval; + int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ if (cs == &top_cpuset) @@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) trialcs = *cs; /* - * An empty cpus_allowed is ok iff there are no tasks in the cpuset. + * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) /* Nothing to do if the cpus didn't change */ if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); if (retval) return retval; @@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf) cs->cpus_allowed = trialcs.cpus_allowed; mutex_unlock(&callback_mutex); - again: /* * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. Since we can't call set_cpus_allowed() - * while holding tasklist_lock, gather tasks to be processed - * in a heap structure. If the statically-sized heap fills up, - * overflow tasks that started later, and in future iterations - * only consider tasks that started after the latest task in - * the previous pass. This guarantees forward progress and - * that we don't miss any tasks + * that need an update. */ - heap.size = 0; - cgroup_iter_start(cgrp, &it); - while ((p = cgroup_iter_next(cgrp, &it))) { - /* Only affect tasks that don't have the right cpus_allowed */ - if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(&heap, p); - if (dropped == NULL) { - get_task_struct(p); - } else if (dropped != p) { - get_task_struct(p); - put_task_struct(dropped); - } - } - cgroup_iter_end(cgrp, &it); - if (heap.size) { - for (i = 0; i < heap.size; i++) { - struct task_struct *p = heap.ptrs[i]; - if (i == 0) { - latest_time = p->start_time; - latest_task = p; - } - set_cpus_allowed(p, cs->cpus_allowed); - put_task_struct(p); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't notice the new cpumask - * restriction. Not the most efficient way to do it, - * but it avoids having to take callback_mutex in the - * fork path - */ - goto again; - } + scan.cg = cs->css.cgroup; + scan.test_task = cpuset_test_cpumask; + scan.process_task = cpuset_change_cpumask; + scan.heap = &heap; + cgroup_scan_tasks(&scan); heap_free(&heap); + if (is_load_balanced) rebuild_sched_domains(); - return 0; } @@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding manage_mutex, so our current->cpuset won't change - * during this call, as manage_mutex holds off any attach_task() + * Call holding cgroup_mutex, so current's cpuset won't change + * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing - * our tasks cpuset. + * our task's cpuset. * * Hold callback_mutex around the two modifications of our tasks * mems_allowed to synchronize with cpuset_mems_allowed(). @@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * the cpuset is marked 'memory_migrate', migrate the tasks * pages to the new memory. * - * Call with manage_mutex held. May take callback_mutex during call. + * Call with cgroup_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * tasklist_lock. Forks can happen again now - the mpol_copy() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global - * cpuset manage_mutex, we know that no other rebind effort will + * cgroup_mutex, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) mmput(mm); } - /* We're done rebinding vma's to this cpusets new mems_allowed. */ + /* We're done rebinding vmas to this cpuset's new mems_allowed. */ kfree(mmarray); cpuset_being_rebound = NULL; retval = 0; @@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void) } /* - * Call with manage_mutex held. + * Call with cgroup_mutex held. */ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) @@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * - * Call with manage_mutex held. + * Call with cgroup_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { @@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * If this becomes a problem for some users who wish to * allow that scenario, then cpuset_post_clone() could be * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. + * (and likewise for mems) to the new cgroup. Called with cgroup_mutex + * held. */ static void cpuset_post_clone(struct cgroup_subsys *ss, struct cgroup *cgroup) @@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * parent: cpuset that will be parent of the new cpuset. - * name: name of the new cpuset. Will be strcpy'ed. - * mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * ss: cpuset cgroup subsystem + * cont: control group that the new cpuset will be part of */ static struct cgroup_subsys_state *cpuset_create( @@ -1687,53 +1640,140 @@ int __init cpuset_init(void) return 0; } +/** + * cpuset_do_move_task - move a given task to another cpuset + * @tsk: pointer to task_struct the task to move + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Called by cgroup_scan_tasks() for each task in a cgroup. + * Return nonzero to stop the walk through the tasks. + */ +void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + struct cpuset_hotplug_scanner *chsp; + + chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); + cgroup_attach_task(chsp->to, tsk); +} + +/** + * move_member_tasks_to_cpuset - move tasks from one cpuset to another + * @from: cpuset in which the tasks currently reside + * @to: cpuset to which the tasks will be moved + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + struct cpuset_hotplug_scanner scan; + + scan.scan.cg = from->css.cgroup; + scan.scan.test_task = NULL; /* select all tasks in cgroup */ + scan.scan.process_task = cpuset_do_move_task; + scan.scan.heap = NULL; + scan.to = to->css.cgroup; + + if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) + printk(KERN_ERR "move_member_tasks_to_cpuset: " + "cgroup_scan_tasks failed\n"); +} + /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then the guarantee_online_cpus() - * or guarantee_online_mems() code will use that emptied cpusets - * parent online CPUs or nodes. Cpusets that were already empty of - * CPUs or nodes are left empty. + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * The cgroup's css_sets list is in use if there are tasks + * in the cpuset; the list is empty if there are none; + * the cs->css.refcnt seems always 0. + */ + if (list_empty(&cs->css.cgroup->css_sets)) + return; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = cs->parent; + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent->parent; + + move_member_tasks_to_cpuset(cs, parent); +} + +/* + * Walk the specified cpuset subtree and look for empty cpusets. + * The tasks of such cpuset must be moved to a parent cpuset. * - * This routine is intentionally inefficient in a couple of regards. - * It will check all cpusets in a subtree even if the top cpuset of - * the subtree has no offline CPUs or nodes. It checks both CPUs and - * nodes, even though the caller could have been coded to know that - * only one of CPUs or nodes needed to be checked on a given call. - * This was done to minimize text size rather than cpu cycles. + * Called with cgroup_mutex held. We take callback_mutex to modify + * cpus_allowed and mems_allowed. * - * Call with both manage_mutex and callback_mutex held. + * This walk processes the tree from top to bottom, completing one layer + * before dropping down to the next. It always processes a node before + * any of its children. * - * Recursive, on depth of cpuset subtree. + * For now, since we lack memory hot unplug, we'll never see a cpuset + * that has tasks along with an empty 'mems'. But if we did see such + * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ - -static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +static void scan_for_empty_cpusets(const struct cpuset *root) { + struct cpuset *cp; /* scans cpusets being updated */ + struct cpuset *child; /* scans child cpusets of cp */ + struct list_head queue; struct cgroup *cont; - struct cpuset *c; - /* Each of our child cpusets mems must be online */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - c = cgroup_cs(cont); - guarantee_online_cpus_mems_in_subtree(c); - if (!cpus_empty(c->cpus_allowed)) - guarantee_online_cpus(c, &c->cpus_allowed); - if (!nodes_empty(c->mems_allowed)) - guarantee_online_mems(c, &c->mems_allowed); + INIT_LIST_HEAD(&queue); + + list_add_tail((struct list_head *)&root->stack_list, &queue); + + while (!list_empty(&queue)) { + cp = container_of(queue.next, struct cpuset, stack_list); + list_del(queue.next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &queue); + } + cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + + /* Remove offline cpus and mems from this cpuset. */ + mutex_lock(&callback_mutex); + cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + nodes_and(cp->mems_allowed, cp->mems_allowed, + node_states[N_HIGH_MEMORY]); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpus_empty(cp->cpus_allowed) || + nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); } } /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug - * event. - * - * To ensure that we don't remove a CPU or node from the top cpuset - * that is currently in use by a child cpuset (which would violate - * the rule that cpusets must be subsets of their parent), we first - * call the recursive routine guarantee_online_cpus_mems_in_subtree(). + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded @@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) static void common_cpu_mem_hotplug_unplug(void) { cgroup_lock(); - mutex_lock(&callback_mutex); - guarantee_online_cpus_mems_in_subtree(&top_cpuset); top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); - mutex_unlock(&callback_mutex); cgroup_unlock(); } @@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. + * Must be called with callback_mutex held. **/ cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) { @@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take manage_mutex, keeping attach_task() from changing it - * anyway. No need to check that tsk->cpuset != NULL, thanks to - * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks - * cpuset to top_cpuset. + * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) { @@ -2219,13 +2255,14 @@ const struct file_operations proc_cpuset_operations = { #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ -char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) -{ - buffer += sprintf(buffer, "Cpus_allowed:\t"); - buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); - buffer += sprintf(buffer, "\n"); - buffer += sprintf(buffer, "Mems_allowed:\t"); - buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); - buffer += sprintf(buffer, "\n"); - return buffer; +void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t"); + m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed:\t"); + m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/exit.c b/kernel/exit.c index 9d3d0f0..506a957 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -293,26 +293,27 @@ static void reparent_to_kthreadd(void) switch_uid(INIT_USER); } -void __set_special_pids(pid_t session, pid_t pgrp) +void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; + pid_t nr = pid_nr(pid); - if (task_session_nr(curr) != session) { + if (task_session(curr) != pid) { detach_pid(curr, PIDTYPE_SID); - set_task_session(curr, session); - attach_pid(curr, PIDTYPE_SID, find_pid(session)); + attach_pid(curr, PIDTYPE_SID, pid); + set_task_session(curr, nr); } - if (task_pgrp_nr(curr) != pgrp) { + if (task_pgrp(curr) != pid) { detach_pid(curr, PIDTYPE_PGID); - set_task_pgrp(curr, pgrp); - attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); + attach_pid(curr, PIDTYPE_PGID, pid); + set_task_pgrp(curr, nr); } } -static void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(struct pid *pid) { write_lock_irq(&tasklist_lock); - __set_special_pids(session, pgrp); + __set_special_pids(pid); write_unlock_irq(&tasklist_lock); } @@ -383,7 +384,11 @@ void daemonize(const char *name, ...) */ current->flags |= PF_NOFREEZE; - set_special_pids(1, 1); + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); proc_clear_tty(current); /* Block and flush all signals */ @@ -398,11 +403,6 @@ void daemonize(const char *name, ...) current->fs = fs; atomic_inc(&fs->count); - if (current->nsproxy != init_task.nsproxy) { - get_nsproxy(init_task.nsproxy); - switch_task_namespaces(current, init_task.nsproxy); - } - exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -458,7 +458,7 @@ struct files_struct *get_files_struct(struct task_struct *task) return files; } -void fastcall put_files_struct(struct files_struct *files) +void put_files_struct(struct files_struct *files) { struct fdtable *fdt; @@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); - } + path_put(&fs->root); + path_put(&fs->pwd); + if (fs->altroot.dentry) + path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } @@ -745,24 +741,6 @@ static void exit_notify(struct task_struct *tsk) struct task_struct *t; struct pid *pgrp; - if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) - && !thread_group_empty(tsk)) { - /* - * This occurs when there was a race between our exit - * syscall and a group signal choosing us as the one to - * wake up. It could be that we are the only thread - * alerted to check for pending signals, but another thread - * should be woken now to take the signal since we will not. - * Now we'll wake all the threads in the group just to make - * sure someone gets all the pending signals. - */ - spin_lock_irq(&tsk->sighand->siglock); - for (t = next_thread(tsk); t != tsk; t = next_thread(t)) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); - spin_unlock_irq(&tsk->sighand->siglock); - } - /* * This does two things: * @@ -905,7 +883,7 @@ static inline void exit_child_reaper(struct task_struct *tsk) zap_pid_ns_processes(tsk->nsproxy->pid_ns); } -fastcall NORET_TYPE void do_exit(long code) +NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -947,7 +925,7 @@ fastcall NORET_TYPE void do_exit(long code) schedule(); } - tsk->flags |= PF_EXITING; + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. @@ -1108,20 +1086,23 @@ asmlinkage void sys_exit_group(int error_code) do_group_exit((error_code & 0xff) << 8); } -static int eligible_child(pid_t pid, int options, struct task_struct *p) +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; +} + +static int eligible_child(enum pid_type type, struct pid *pid, int options, + struct task_struct *p) { int err; - struct pid_namespace *ns; - ns = current->nsproxy->pid_ns; - if (pid > 0) { - if (task_pid_nr_ns(p, ns) != pid) - return 0; - } else if (!pid) { - if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current)) - return 0; - } else if (pid != -1) { - if (task_pgrp_nr_ns(p, ns) != -pid) + if (type < PIDTYPE_MAX) { + if (task_pid_type(p, type) != pid) return 0; } @@ -1140,18 +1121,16 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) && !(options & __WALL)) return 0; - /* - * Do not consider thread group leaders that are - * in a non-empty thread group: - */ - if (delay_group_leader(p)) - return 2; err = security_task_wait(p); - if (err) - return err; + if (likely(!err)) + return 1; - return 1; + if (type != PIDTYPE_PID) + return 0; + /* This child was explicitly requested, abort */ + read_unlock(&tasklist_lock); + return err; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1191,20 +1170,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap, { unsigned long state; int retval, status, traced; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; + pid_t pid = task_pid_vnr(p); if (unlikely(noreap)) { - pid_t pid = task_pid_nr_ns(p, ns); uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; - if (unlikely(p->exit_state != EXIT_ZOMBIE)) - return 0; - if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) - return 0; get_task_struct(p); read_unlock(&tasklist_lock); if ((exit_code & 0x7f) == 0) { @@ -1315,11 +1287,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, retval = put_user(status, &infop->si_status); } if (!retval && infop) - retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); + retval = put_user(pid, &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; if (traced) { write_lock_irq(&tasklist_lock); @@ -1351,21 +1323,38 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, +static int wait_task_stopped(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { - int retval, exit_code; + int retval, exit_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; - if (!p->exit_code) - return 0; - if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && - p->signal->group_stop_count > 0) + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + if (unlikely(!task_is_stopped_or_traced(p))) + goto unlock_sig; + + if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. */ + goto unlock_sig; + + exit_code = p->exit_code; + if (!exit_code) + goto unlock_sig; + + if (!noreap) + p->exit_code = 0; + + uid = p->uid; +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) return 0; /* @@ -1375,65 +1364,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ - pid = task_pid_nr_ns(p, current->nsproxy->pid_ns); get_task_struct(p); + pid = task_pid_vnr(p); + why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) { - uid_t uid = p->uid; - int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; - - exit_code = p->exit_code; - if (unlikely(!exit_code) || unlikely(p->exit_state)) - goto bail_ref; + if (unlikely(noreap)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); - } - - write_lock_irq(&tasklist_lock); - - /* - * This uses xchg to be atomic with the thread resuming and setting - * it. It must also be done with the write lock held to prevent a - * race with the EXIT_ZOMBIE case. - */ - exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->exit_state)) { - /* - * The task resumed and then died. Let the next iteration - * catch it in EXIT_ZOMBIE. Note that exit_code might - * already be zero here if it resumed and did _exit(0). - * The task itself is dead and won't touch exit_code again; - * other processors in this function are locked out. - */ - p->exit_code = exit_code; - exit_code = 0; - } - if (unlikely(exit_code == 0)) { - /* - * Another thread in this function got to it first, or it - * resumed, or it resumed and then died. - */ - write_unlock_irq(&tasklist_lock); -bail_ref: - put_task_struct(p); - /* - * We are returning to the wait loop without having successfully - * removed the process and having released the lock. We cannot - * continue, since the "p" task pointer is potentially stale. - * - * Return -EAGAIN, and do_wait() will restart the loop from the - * beginning. Do _not_ re-acquire the lock. - */ - return -EAGAIN; - } - - /* move to end of parent's list to avoid starvation */ - remove_parent(p); - add_parent(p); - - write_unlock_irq(&tasklist_lock); retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) @@ -1443,15 +1382,13 @@ bail_ref: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user((short)((p->ptrace & PT_PTRACED) - ? CLD_TRAPPED : CLD_STOPPED), - &infop->si_code); + retval = put_user(why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; put_task_struct(p); @@ -1473,7 +1410,6 @@ static int wait_task_continued(struct task_struct *p, int noreap, int retval; pid_t pid; uid_t uid; - struct pid_namespace *ns; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1488,8 +1424,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); - ns = current->nsproxy->pid_ns; - pid = task_pid_nr_ns(p, ns); + pid = task_pid_vnr(p); uid = p->uid; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1500,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, if (!retval && stat_addr) retval = put_user(0xffff, stat_addr); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; } else { retval = wait_noreap_copyout(p, pid, uid, CLD_CONTINUED, SIGCONT, @@ -1511,103 +1446,63 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } - -static inline int my_ptrace_child(struct task_struct *p) -{ - if (!(p->ptrace & PT_PTRACED)) - return 0; - if (!(p->ptrace & PT_ATTACHED)) - return 1; - /* - * This child was PTRACE_ATTACH'd. We should be seeing it only if - * we are the attacher. If we are the real parent, this is a race - * inside ptrace_attach. It is waiting for the tasklist_lock, - * which we have to switch the parent links, but has already set - * the flags in p->ptrace. - */ - return (p->parent != p->real_parent); -} - -static long do_wait(pid_t pid, int options, struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) +static long do_wait(enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; - int allowed, denied; add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: + /* If there is nothing that can match our critier just get out */ + retval = -ECHILD; + if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) + goto end; + /* * We will set this flag if we see any child that might later * match our criteria, even if we are not able to reap it yet. */ - flag = 0; - allowed = denied = 0; + flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { struct task_struct *p; - int ret; list_for_each_entry(p, &tsk->children, sibling) { - ret = eligible_child(pid, options, p); + int ret = eligible_child(type, pid, options, p); if (!ret) continue; if (unlikely(ret < 0)) { - denied = ret; - continue; - } - allowed = 1; - - if (task_is_stopped_or_traced(p)) { + retval = ret; + } else if (task_is_stopped_or_traced(p)) { /* * It's stopped now, so it might later * continue, exit, or stop again. - * - * When we hit the race with PTRACE_ATTACH, we - * will not report this child. But the race - * means it has not yet been moved to our - * ptrace_children list, so we need to set the - * flag here to avoid a spurious ECHILD when - * the race happens with the only child. */ flag = 1; + if (!(p->ptrace & PT_PTRACED) && + !(options & WUNTRACED)) + continue; - if (!my_ptrace_child(p)) { - if (task_is_traced(p)) - continue; - if (!(options & WUNTRACED)) - continue; - } - - retval = wait_task_stopped(p, ret == 2, + retval = wait_task_stopped(p, (options & WNOWAIT), infop, stat_addr, ru); - if (retval == -EAGAIN) - goto repeat; - if (retval != 0) /* He released the lock. */ - goto end; - } else if (p->exit_state == EXIT_DEAD) { - continue; - } else if (p->exit_state == EXIT_ZOMBIE) { + } else if (p->exit_state == EXIT_ZOMBIE && + !delay_group_leader(p)) { /* - * Eligible but we cannot release it yet: + * We don't reap group leaders with subthreads. */ - if (ret == 2) - goto check_continued; if (!likely(options & WEXITED)) continue; retval = wait_task_zombie(p, (options & WNOWAIT), infop, stat_addr, ru); - /* He released the lock. */ - if (retval != 0) - goto end; - } else { -check_continued: + } else if (p->exit_state != EXIT_DEAD) { /* * It's running now, so it might later * exit, stop, or stop and then continue. @@ -1618,17 +1513,20 @@ check_continued: retval = wait_task_continued(p, (options & WNOWAIT), infop, stat_addr, ru); - if (retval != 0) /* He released the lock. */ - goto end; } + if (retval != 0) /* tasklist_lock released */ + goto end; } if (!flag) { list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - if (!eligible_child(pid, options, p)) + ptrace_list) { + flag = eligible_child(type, pid, options, p); + if (!flag) continue; - flag = 1; - break; + if (likely(flag > 0)) + break; + retval = flag; + goto end; } } if (options & __WNOTHREAD) @@ -1636,10 +1534,9 @@ check_continued: tsk = next_thread(tsk); BUG_ON(tsk->signal != current->signal); } while (tsk != current); - read_unlock(&tasklist_lock); + if (flag) { - retval = 0; if (options & WNOHANG) goto end; retval = -ERESTARTSYS; @@ -1649,14 +1546,12 @@ check_continued: goto repeat; } retval = -ECHILD; - if (unlikely(denied) && !allowed) - retval = denied; end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); if (infop) { if (retval > 0) - retval = 0; + retval = 0; else { /* * For a WNOHANG return, clear out all the fields @@ -1680,10 +1575,12 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t pid, +asmlinkage long sys_waitid(int which, pid_t upid, struct siginfo __user *infop, int options, struct rusage __user *ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) @@ -1693,37 +1590,58 @@ asmlinkage long sys_waitid(int which, pid_t pid, switch (which) { case P_ALL: - pid = -1; + type = PIDTYPE_MAX; break; case P_PID: - if (pid <= 0) + type = PIDTYPE_PID; + if (upid <= 0) return -EINVAL; break; case P_PGID: - if (pid <= 0) + type = PIDTYPE_PGID; + if (upid <= 0) return -EINVAL; - pid = -pid; break; default: return -EINVAL; } - ret = do_wait(pid, options, infop, NULL, ru); + if (type < PIDTYPE_MAX) + pid = find_get_pid(upid); + ret = do_wait(type, pid, options, infop, NULL, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } -asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, +asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage __user *ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; - ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + if (upid == -1) + type = PIDTYPE_MAX; + else if (upid < 0) { + type = PIDTYPE_PGID; + pid = find_get_pid(-upid); + } else if (upid == 0) { + type = PIDTYPE_PGID; + pid = get_pid(task_pgrp(current)); + } else /* upid > 0 */ { + type = PIDTYPE_PID; + pid = find_get_pid(upid); + } + + ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); diff --git a/kernel/fork.c b/kernel/fork.c index 2b55b74..dd249c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> +#include <linux/memcontrol.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> @@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #include <linux/init_task.h> -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_cgroup(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } + + mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm); + mm = mm_init(mm, current); } return mm; } @@ -386,10 +390,11 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -void fastcall __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); + mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; - if (!mm_init(mm)) + if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) @@ -595,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + if (old->altroot.dentry) { + fs->altroot = old->altroot; + path_get(&old->altroot); } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; + fs->altroot.mnt = NULL; + fs->altroot.dentry = NULL; } read_unlock(&old->lock); } @@ -904,7 +909,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->tsk = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; @@ -1333,6 +1337,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; + p->signal->leader_pid = pid; p->signal->tty = current->signal->tty; set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); @@ -1399,7 +1404,7 @@ fork_out: return ERR_PTR(retval); } -noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; @@ -1483,13 +1488,7 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; - /* - * this is enough to call pid_nr_ns here, but this if - * improves optimisation of regular fork() - */ - nr = (clone_flags & CLONE_NEWPID) ? - task_pid_nr_ns(p, current->nsproxy->pid_ns) : - task_pid_vnr(p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1510,7 +1509,7 @@ long do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_STOPPED)) wake_up_new_task(p, clone_flags); else - p->state = TASK_STOPPED; + __set_task_state(p, TASK_STOPPED); if (unlikely (trace)) { current->ptrace_message = nr; diff --git a/kernel/futex.c b/kernel/futex.c index a6baaec..221f212 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } /* diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 133d558d..7d5e4b0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 668f396..98bee01 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -326,6 +326,23 @@ u64 ktime_divns(const ktime_t kt, s64 div) #endif /* BITS_PER_LONG >= 64 */ /* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +/* * Check, whether the timer is on the callback pending list */ static inline int hrtimer_cb_pending(const struct hrtimer *timer) @@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, ktime_t expires = ktime_sub(timer->expires, base->offset); int res; + WARN_ON_ONCE(timer->expires.tv64 < 0); + /* * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or @@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (hrtimer_callback_running(timer)) return 0; + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + if (expires.tv64 >= expires_next->tv64) return 0; @@ -682,13 +710,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add(timer->expires, interval); - /* - * Make sure, that the result did not wrap with a very large - * interval. - */ - if (timer->expires.tv64 < 0) - timer->expires = ktime_set(KTIME_SEC_MAX, 0); + timer->expires = ktime_add_safe(timer->expires, interval); return orun; } @@ -839,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) new_base = switch_hrtimer_base(timer, base); if (mode == HRTIMER_MODE_REL) { - tim = ktime_add(tim, new_base->get_time()); + tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -848,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add(tim, base->resolution); + tim = ktime_add_safe(tim, base->resolution); #endif - /* - * Careful here: User space might have asked for a - * very long sleep, so the add above might result in a - * negative number, which enqueues the timer in front - * of the queue. - */ - if (tim.tv64 < 0) - tim.tv64 = KTIME_MAX; } timer->expires = tim; @@ -1319,13 +1333,26 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return t->task == NULL; } +static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(timer->expires, timer->base->get_time()); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec *rmtp; - ktime_t time; - - restart->fn = do_no_restart_syscall; + struct timespec __user *rmtp; hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; @@ -1333,26 +1360,22 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; - rmtp = (struct timespec *)restart->arg1; + rmtp = (struct timespec __user *)restart->arg1; if (rmtp) { - time = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (time.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(time); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } - restart->fn = hrtimer_nanosleep_restart; - /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - ktime_t rem; hrtimer_init(&t.timer, clockid, mode); t.timer.expires = timespec_to_ktime(*rqtp); @@ -1364,10 +1387,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, return -ERESTARTNOHAND; if (rmtp) { - rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (rem.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(rem); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } restart = ¤t_thread_info()->restart_block; @@ -1383,8 +1405,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) { - struct timespec tu, rmt; - int ret; + struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; @@ -1392,15 +1413,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 44019ce..cc54c62 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -286,7 +286,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void fastcall +void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; @@ -327,7 +327,7 @@ out_unlock: * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void fastcall +void handle_level_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); @@ -375,7 +375,7 @@ out_unlock: * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void fastcall +void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); @@ -434,7 +434,7 @@ out: * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void fastcall +void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { const unsigned int cpu = smp_processor_id(); @@ -505,7 +505,7 @@ out_unlock: * * Per CPU interrupts on SMP machines without locking requirements */ -void fastcall +void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; @@ -589,3 +589,39 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, set_irq_chip(irq, chip); __set_irq_handler(irq, handle, 0, name); } + +void __init set_irq_noprobe(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + + return; + } + + desc = irq_desc + irq; + + spin_lock_irqsave(&desc->lock, flags); + desc->status |= IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); +} + +void __init set_irq_probe(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); + + return; + } + + desc = irq_desc + irq; + + spin_lock_irqsave(&desc->lock, flags); + desc->status &= ~IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index dc335ad..5fa6198 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -25,7 +25,7 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void fastcall +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); @@ -163,7 +163,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) * This is the original x86 implementation which is used for every * interrupt type. */ -fastcall unsigned int __do_IRQ(unsigned int irq) +unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; diff --git a/kernel/itimer.c b/kernel/itimer.c index 2fab344..ab98274 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -132,7 +132,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7dadc71..f091d13 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -53,14 +53,6 @@ static inline int is_kernel_inittext(unsigned long addr) return 0; } -static inline int is_kernel_extratext(unsigned long addr) -{ - if (addr >= (unsigned long)_sextratext - && addr <= (unsigned long)_eextratext) - return 1; - return 0; -} - static inline int is_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -80,8 +72,7 @@ static int is_ksym_addr(unsigned long addr) if (all_var) return is_kernel(addr); - return is_kernel_text(addr) || is_kernel_inittext(addr) || - is_kernel_extratext(addr); + return is_kernel_text(addr) || is_kernel_inittext(addr); } /* expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kexec.c b/kernel/kexec.c index 9a26eec..06a0e27 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) static int __init crash_save_vmcoreinfo_init(void) { - vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); - vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); @@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_SPARSEMEM VMCOREINFO_SYMBOL(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_SIZE(mem_section); + VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); #endif - VMCOREINFO_SIZE(page); - VMCOREINFO_SIZE(pglist_data); - VMCOREINFO_SIZE(zone); - VMCOREINFO_SIZE(free_area); - VMCOREINFO_SIZE(list_head); - VMCOREINFO_TYPEDEF_SIZE(nodemask_t); + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); VMCOREINFO_OFFSET(page, _count); VMCOREINFO_OFFSET(page, mapping); diff --git a/kernel/kmod.c b/kernel/kmod.c index bb7df2a..22be3ff 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - retval = -EPERM; - if (current->fs->root) - retval = kernel_execve(sub_info->path, - sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d0493ea..7a86e64 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -699,6 +699,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct kretprobe_instance, uflist); ri->rp = rp; ri->task = current; + + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + spin_unlock_irqrestore(&kretprobe_lock, flags); + return 0; + } + arch_prepare_kretprobe(ri, regs); /* XXX(hch): why is there no hlist_move_head? */ @@ -745,7 +751,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) INIT_HLIST_HEAD(&rp->used_instances); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); + inst = kmalloc(sizeof(struct kretprobe_instance) + + rp->data_size, GFP_KERNEL); if (inst == NULL) { free_rp_inst(rp); return -ENOMEM; diff --git a/kernel/marker.c b/kernel/marker.c index 5323cfa..c4c2cd8 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -27,35 +27,42 @@ extern struct marker __start___markers[]; extern struct marker __stop___markers[]; +/* Set to 1 to enable marker debug output */ +const int marker_debug; + /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers, the hash table and deferred_sync. + * and module markers and the hash table. */ static DEFINE_MUTEX(markers_mutex); /* - * Marker deferred synchronization. - * Upon marker probe_unregister, we delay call to synchronize_sched() to - * accelerate mass unregistration (only when there is no more reference to a - * given module do we call synchronize_sched()). However, we need to make sure - * every critical region has ended before we re-arm a marker that has been - * unregistered and then registered back with a different probe data. - */ -static int deferred_sync; - -/* * Marker hash table, containing the active markers. * Protected by module_mutex. */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ struct marker_entry { struct hlist_node hlist; char *format; - marker_probe_func *probe; - void *private; + void (*call)(const struct marker *mdata, /* Probe wrapper */ + void *call_private, const char *fmt, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + char rcu_pending:1; + char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /** * __mark_empty_function - Empty probe callback - * @mdata: pointer of type const struct marker + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string * @...: variable argument list * @@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(const struct marker *mdata, void *private, - const char *fmt, ...) +void __mark_empty_function(void *probe_private, void *call_private, + const char *fmt, va_list *args) { } EXPORT_SYMBOL_GPL(__mark_empty_function); /* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +void marker_probe_cb(const struct marker *mdata, void *call_private, + const char *fmt, ...) +{ + va_list args; + char ptype; + + /* + * disabling preemption to make sure the teardown of the callbacks can + * be done correctly when they are in modules and they insure RCU read + * coherency. + */ + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, fmt); + func(mdata->single.probe_private, call_private, fmt, &args); + va_end(args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) { + va_start(args, fmt); + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + va_end(args); + } + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, const char *fmt, ...) +{ + va_list args; /* not initialized */ + char ptype; + + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata->single.probe_private, call_private, fmt, &args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* * Get marker if the marker is present in the marker hash table. * Must be called with markers_mutex held. * Returns NULL if not present. @@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name) * Add the marker to the marker hash table. Must be called with markers_mutex * held. */ -static int add_marker(const char *name, const char *format, - marker_probe_func *probe, void *private) +static struct marker_entry *add_marker(const char *name, const char *format) { struct hlist_head *head; struct hlist_node *node; @@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format, hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE - "Marker %s busy, probe %p already installed\n", - name, e->probe); - return -EBUSY; /* Already there */ + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ } } /* @@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format, e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(&e->name[0], name, name_len); if (format) { e->format = &e->name[name_len]; memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; trace_mark(core_marker_format, "name %s format %s", e->name, e->format); - } else + } else { e->format = NULL; - e->probe = probe; - e->private = private; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); - return 0; + return e; } /* * Remove the marker from the marker hash table. Must be called with mutex_lock * held. */ -static void *remove_marker(const char *name) +static int remove_marker(const char *name) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; int found = 0; size_t len = strlen(name) + 1; - void *private = NULL; u32 hash = jhash(name, len-1, 0); head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; @@ -167,12 +435,16 @@ static void *remove_marker(const char *name) break; } } - if (found) { - private = e->private; - hlist_del(&e->hlist); - kfree(e); - } - return private; + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; } /* @@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) size_t name_len = strlen((*entry)->name) + 1; size_t format_len = strlen(format) + 1; + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) @@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format) memcpy(&e->name[0], (*entry)->name, name_len); e->format = &e->name[name_len]; memcpy(e->format, format, format_len); - e->probe = (*entry)->probe; - e->private = (*entry)->private; + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format) /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem) +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) { int ret; WARN_ON(strcmp((*entry)->name, elem->name) != 0); @@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) if (ret) return ret; } - elem->call = (*entry)->probe; - elem->private = (*entry)->private; - elem->state = 1; + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. + */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state = active; + return 0; } @@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) */ static void disable_marker(struct marker *elem) { + /* leave "call" as is. It is known statically. */ elem->state = 0; - elem->call = __mark_empty_function; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and * should be done only after a synchronize_sched(). These are never used @@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem) * marker_update_probe_range - Update a probe range * @begin: beginning of the range * @end: end of the range - * @probe_module: module address of the probe being updated - * @refcount: number of references left to the given probe_module (out) * * Updates the probe callback corresponding to a range of markers. */ void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, - int *refcount) + struct marker *end) { struct marker *iter; struct marker_entry *mark_entry; @@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin, mutex_lock(&markers_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); - if (mark_entry && mark_entry->refcount) { - set_marker(&mark_entry, iter); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); /* * ignore error, continue */ - if (probe_module) - if (probe_module == - __module_text_address((unsigned long)mark_entry->probe)) - (*refcount)++; } else { disable_marker(iter); } @@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin, * Issues a synchronize_sched() when no reference to the module passed * as parameter is found in the probes so the probe module can be * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). */ -static void marker_update_probes(struct module *probe_module) +static void marker_update_probes(void) { - int refcount = 0; - /* Core kernel markers */ - marker_update_probe_range(__start___markers, - __stop___markers, probe_module, &refcount); + marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ - module_update_markers(probe_module, &refcount); - if (probe_module && refcount == 0) { - synchronize_sched(); - deferred_sync = 0; - } + module_update_markers(); } /** @@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module) * @name: marker name * @format: format string * @probe: probe handler - * @private: probe private data + * @probe_private: probe private data * * private data must be a valid allocated memory address, or NULL. * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. */ int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private) + marker_probe_func *probe, void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); entry = get_marker(name); - if (entry && entry->refcount) { - ret = -EBUSY; - goto end; - } - if (deferred_sync) { - synchronize_sched(); - deferred_sync = 0; + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } } - ret = add_marker(name, format, probe, private); - if (ret) + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; + } mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register); /** * marker_probe_unregister - Disconnect a probe from a marker * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data * * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -void *marker_probe_unregister(const char *name) +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) { - struct module *probe_module; struct marker_entry *entry; - void *private; + struct marker_probe_closure *old; + int ret = 0; mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) { - private = ERR_PTR(-ENOENT); + ret = -ENOENT; goto end; } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(name); - deferred_sync = 1; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - return private; + return ret; } EXPORT_SYMBOL_GPL(marker_probe_unregister); -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @private: probe private data - * - * Unregister a marker by providing the registered private data. - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - */ -void *marker_probe_unregister_private_data(void *private) +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) { - struct module *probe_module; - struct hlist_head *head; - struct hlist_node *node; struct marker_entry *entry; - int found = 0; unsigned int i; + struct hlist_head *head; + struct hlist_node *node; - mutex_lock(&markers_mutex); for (i = 0; i < MARKER_TABLE_SIZE; i++) { head = &marker_table[i]; hlist_for_each_entry(entry, node, head, hlist) { - if (entry->private == private) { - found = 1; - goto iter_end; + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } } } } -iter_end: - if (!found) { - private = ERR_PTR(-ENOENT); - goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(entry->name); - deferred_sync = 1; - mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; -end: - mutex_unlock(&markers_mutex); - return private; + return NULL; } -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** - * marker_arm - Arm a marker - * @name: marker name + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data * - * Activate a marker. It keeps a reference count of the number of - * arming/disarming done. - * Returns 0 if ok, error value on error. + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -int marker_arm(const char *name) +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); - entry = get_marker(name); + entry = get_marker_from_private_data(probe, probe_private); if (!entry) { ret = -ENOENT; goto end; } - /* - * Only need to update probes when refcount passes from 0 to 1. - */ - if (entry->refcount++) - goto end; -end: + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; -} -EXPORT_SYMBOL_GPL(marker_arm); - -/** - * marker_disarm - Disarm a marker - * @name: marker name - * - * Disarm a marker. It keeps a reference count of the number of arming/disarming - * done. - * Returns 0 if ok, error value on error. - */ -int marker_disarm(const char *name) -{ - struct marker_entry *entry; - int ret = 0; - + marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - ret = -ENOENT; - goto end; - } - /* - * Only permit decrement refcount if higher than 0. - * Do probe update only on 1 -> 0 transition. - */ - if (entry->refcount) { - if (--entry->refcount) - goto end; - } else { - ret = -EPERM; - goto end; - } + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - marker_update_probes(NULL); return ret; } -EXPORT_SYMBOL_GPL(marker_disarm); +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** * marker_get_private_data - Get a marker's probe private data * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. * Returns the private data pointer, or an ERR_PTR. * The private data pointer should _only_ be dereferenced if the caller is the * owner of the data, or its content could vanish. This is mostly used to * confirm that a caller is the owner of a registered probe. */ -void *marker_get_private_data(const char *name) +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - int found = 0; + int i; head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { - found = 1; - return e->private; + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } } } return ERR_PTR(-ENOENT); diff --git a/kernel/module.c b/kernel/module.c index bd60278e..92595ba 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,7 @@ #include <asm/semaphore.h> #include <asm/cacheflush.h> #include <linux/license.h> +#include <asm/sections.h> #if 0 #define DEBUGP printk @@ -290,7 +291,7 @@ static unsigned long __find_symbol(const char *name, } } DEBUGP("Failed to find symbol %s\n", name); - return 0; + return -ENOENT; } /* Search for module by name: must hold module_mutex. */ @@ -343,9 +344,6 @@ static inline unsigned int block_size(int val) return val; } -/* Created by linker magic */ -extern char __per_cpu_start[], __per_cpu_end[]; - static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -783,7 +781,7 @@ void __symbol_put(const char *symbol) const unsigned long *crc; preempt_disable(); - if (!__find_symbol(symbol, &owner, &crc, 1)) + if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) BUG(); module_put(owner); preempt_enable(); @@ -929,7 +927,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, const unsigned long *crc; struct module *owner; - if (!__find_symbol("struct_module", &owner, &crc, 1)) + if (IS_ERR_VALUE(__find_symbol("struct_module", + &owner, &crc, 1))) BUG(); return check_version(sechdrs, versindex, "struct_module", mod, crc); @@ -978,12 +977,12 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, ret = __find_symbol(name, &owner, &crc, !(mod->taints & TAINT_PROPRIETARY_MODULE)); - if (ret) { + if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ if (!check_version(sechdrs, versindex, name, mod, crc) || !use_module(mod, owner)) - ret = 0; + ret = -EINVAL; } return ret; } @@ -1371,7 +1370,9 @@ void *__symbol_get(const char *symbol) preempt_disable(); value = __find_symbol(symbol, &owner, &crc, 1); - if (value && strong_try_module_get(owner) != 0) + if (IS_ERR_VALUE(value)) + value = 0; + else if (strong_try_module_get(owner)) value = 0; preempt_enable(); @@ -1391,14 +1392,16 @@ static int verify_export_symbols(struct module *mod) const unsigned long *crc; for (i = 0; i < mod->num_syms; i++) - if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { + if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, + &owner, &crc, 1))) { name = mod->syms[i].name; ret = -ENOEXEC; goto dup; } for (i = 0; i < mod->num_gpl_syms; i++) - if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { + if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, + &owner, &crc, 1))) { name = mod->gpl_syms[i].name; ret = -ENOEXEC; goto dup; @@ -1448,7 +1451,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, strtab + sym[i].st_name, mod); /* Ok if resolved. */ - if (sym[i].st_value != 0) + if (!IS_ERR_VALUE(sym[i].st_value)) break; /* Ok if weak. */ if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) @@ -2035,7 +2038,7 @@ static struct module *load_module(void __user *umod, #ifdef CONFIG_MARKERS if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, NULL, NULL); + mod->markers + mod->num_markers); #endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2250,7 +2253,7 @@ static const char *get_ksymbol(struct module *mod, /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ -char *module_address_lookup(unsigned long addr, +const char *module_address_lookup(unsigned long addr, unsigned long *size, unsigned long *offset, char **modname, @@ -2275,7 +2278,7 @@ char *module_address_lookup(unsigned long addr, ret = namebuf; } preempt_enable(); - return (char *)ret; + return ret; } int lookup_module_symbol_name(unsigned long addr, char *symname) @@ -2561,7 +2564,7 @@ EXPORT_SYMBOL(struct_module); #endif #ifdef CONFIG_MARKERS -void module_update_markers(struct module *probe_module, int *refcount) +void module_update_markers(void) { struct module *mod; @@ -2569,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount) list_for_each_entry(mod, &modules, list) if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, - probe_module, refcount); + mod->markers + mod->num_markers); mutex_unlock(&module_mutex); } #endif diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index d17436c..3aaa06c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -107,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * use of the mutex is forbidden. The mutex must not be locked when * this function is called. */ -void fastcall mutex_destroy(struct mutex *lock) +void mutex_destroy(struct mutex *lock) { DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); lock->magic = NULL; diff --git a/kernel/mutex.c b/kernel/mutex.c index d9ec9b6..d046a34 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void fastcall noinline __sched +static void noinline __sched __mutex_lock_slowpath(atomic_t *lock_count); /*** @@ -82,7 +82,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down(). */ -void inline fastcall __sched mutex_lock(struct mutex *lock) +void inline __sched mutex_lock(struct mutex *lock) { might_sleep(); /* @@ -95,8 +95,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static void fastcall noinline __sched -__mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -109,7 +108,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) up(). */ -void fastcall __sched mutex_unlock(struct mutex *lock) +void __sched mutex_unlock(struct mutex *lock) { /* * The unlocking fastpath is the 0->1 transition from 'locked' @@ -234,7 +233,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); /* * Release the lock, slowpath: */ -static fastcall inline void +static inline void __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -271,7 +270,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static fastcall noinline void +static noinline void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -282,10 +281,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) * Here come the less common (and hence less performance-critical) APIs: * mutex_lock_interruptible() and mutex_trylock(). */ -static int fastcall noinline __sched +static noinline int __sched __mutex_lock_killable_slowpath(atomic_t *lock_count); -static noinline int fastcall __sched +static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); /*** @@ -299,7 +298,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down_interruptible(). */ -int fastcall __sched mutex_lock_interruptible(struct mutex *lock) +int __sched mutex_lock_interruptible(struct mutex *lock) { might_sleep(); return __mutex_fastpath_lock_retval @@ -308,7 +307,7 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); -int fastcall __sched mutex_lock_killable(struct mutex *lock) +int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); return __mutex_fastpath_lock_retval @@ -316,7 +315,7 @@ int fastcall __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static void fastcall noinline __sched +static noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -324,7 +323,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); } -static int fastcall noinline __sched +static noinline int __sched __mutex_lock_killable_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -332,7 +331,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); } -static noinline int fastcall __sched +static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -381,7 +380,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) * This function must not be used in interrupt context. The * mutex must be released by the same task that acquired it. */ -int fastcall __sched mutex_trylock(struct mutex *lock) +int __sched mutex_trylock(struct mutex *lock) { return __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); diff --git a/kernel/notifier.c b/kernel/notifier.c index 4253f47..643360d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -4,6 +4,7 @@ #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> +#include <linux/reboot.h> /* * Notifier list for kernel code which wants to be called diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 79f871b..f5d332c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -21,6 +21,7 @@ #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <net/net_namespace.h> +#include <linux/ipc_namespace.h> static struct kmem_cache *nsproxy_cachep; diff --git a/kernel/panic.c b/kernel/panic.c index d9e90cf..24af9f8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -161,7 +161,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -169,7 +169,8 @@ const char *print_tainted(void) tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', tainted & TAINT_BAD_PAGE ? 'B' : ' ', tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' '); + tainted & TAINT_DIE ? 'D' : ' ', + tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); diff --git a/kernel/params.c b/kernel/params.c index 42fe5e6..afc46a2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -180,12 +180,12 @@ int parse_args(const char *name, #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ int param_set_##name(const char *val, struct kernel_param *kp) \ { \ - char *endp; \ tmptype l; \ + int ret; \ \ if (!val) return -EINVAL; \ - l = strtolfn(val, &endp, 0); \ - if (endp == val || ((type)l != l)) \ + ret = strtolfn(val, 0, &l); \ + if (ret == -EINVAL || ((type)l != l)) \ return -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ @@ -195,13 +195,13 @@ int parse_args(const char *name, return sprintf(buffer, format, *((type *)kp->arg)); \ } -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); int param_set_charp(const char *val, struct kernel_param *kp) { @@ -272,7 +272,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), - int *num) + unsigned int *num) { int ret; struct kernel_param kp; diff --git a/kernel/pid.c b/kernel/pid.c index f815455..4776915 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,7 +41,6 @@ static struct hlist_head *pid_hash; static int pidhash_shift; struct pid init_struct_pid = INIT_STRUCT_PID; -static struct kmem_cache *pid_ns_cachep; int pid_max = PID_MAX_DEFAULT; @@ -112,7 +111,7 @@ EXPORT_SYMBOL(is_container_init); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) +static void free_pidmap(struct pid_namespace *pid_ns, int pid) { struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; @@ -181,7 +180,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -static int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; struct pidmap *map, *end; @@ -199,7 +198,7 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last) return -1; } -fastcall void put_pid(struct pid *pid) +void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -221,7 +220,7 @@ static void delayed_put_pid(struct rcu_head *rhp) put_pid(pid); } -fastcall void free_pid(struct pid *pid) +void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; @@ -287,7 +286,7 @@ out_free: goto out; } -struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) +struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { struct hlist_node *elem; struct upid *pnr; @@ -317,7 +316,7 @@ EXPORT_SYMBOL_GPL(find_pid); /* * attach_pid() must be called with the tasklist_lock write-held. */ -int fastcall attach_pid(struct task_struct *task, enum pid_type type, +int attach_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { struct pid_link *link; @@ -329,7 +328,7 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, return 0; } -void fastcall detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct task_struct *task, enum pid_type type) { struct pid_link *link; struct pid *pid; @@ -349,7 +348,7 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type) } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ -void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, +void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { new->pids[type].pid = old->pids[type].pid; @@ -357,7 +356,7 @@ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, old->pids[type].pid = NULL; } -struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) +struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { @@ -368,6 +367,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) } return result; } +EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. @@ -408,7 +408,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) return pid; } -struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) +struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); @@ -443,6 +443,12 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) return nr; } +pid_t pid_vnr(struct pid *pid) +{ + return pid_nr_ns(pid, current->nsproxy->pid_ns); +} +EXPORT_SYMBOL_GPL(pid_vnr); + pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return pid_nr_ns(task_pid(tsk), ns); @@ -487,180 +493,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) } EXPORT_SYMBOL_GPL(find_get_pid); -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); -static DEFINE_MUTEX(pid_caches_mutex); - -/* - * creates the kmem cache to allocate pids from. - * @nr_ids: the number of numerical ids this pid will have to carry - */ - -static struct kmem_cache *create_pid_cachep(int nr_ids) -{ - struct pid_cache *pcache; - struct kmem_cache *cachep; - - mutex_lock(&pid_caches_mutex); - list_for_each_entry (pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: - mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; -} - -#ifdef CONFIG_PID_NS -static struct pid_namespace *create_pid_namespace(int level) -{ - struct pid_namespace *ns; - int i; - - ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); - if (ns == NULL) - goto out; - - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; - - ns->pid_cachep = create_pid_cachep(level + 1); - if (ns->pid_cachep == NULL) - goto out_free_map; - - kref_init(&ns->kref); - ns->last_pid = 0; - ns->child_reaper = NULL; - ns->level = level; - - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - } - - return ns; - -out_free_map: - kfree(ns->pidmap[0].page); -out_free: - kmem_cache_free(pid_ns_cachep, ns); -out: - return ERR_PTR(-ENOMEM); -} - -static void destroy_pid_namespace(struct pid_namespace *ns) -{ - int i; - - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); - kmem_cache_free(pid_ns_cachep, ns); -} - -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) -{ - struct pid_namespace *new_ns; - - BUG_ON(!old_ns); - new_ns = get_pid_ns(old_ns); - if (!(flags & CLONE_NEWPID)) - goto out; - - new_ns = ERR_PTR(-EINVAL); - if (flags & CLONE_THREAD) - goto out_put; - - new_ns = create_pid_namespace(old_ns->level + 1); - if (!IS_ERR(new_ns)) - new_ns->parent = get_pid_ns(old_ns); - -out_put: - put_pid_ns(old_ns); -out: - return new_ns; -} - -void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns, *parent; - - ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; - destroy_pid_namespace(ns); - - if (parent != NULL) - put_pid_ns(parent); -} -#endif /* CONFIG_PID_NS */ - -void zap_pid_ns_processes(struct pid_namespace *pid_ns) -{ - int nr; - int rc; - - /* - * The last thread in the cgroup-init thread group is terminating. - * Find remaining pid_ts in the namespace, signal and wait for them - * to exit. - * - * Note: This signals each threads in the namespace - even those that - * belong to the same thread group, To avoid this, we would have - * to walk the entire tasklist looking a processes in this - * namespace, but that could be unnecessarily expensive if the - * pid namespace has just a few processes. Or we need to - * maintain a tasklist for each pid namespace. - * - */ - read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); - nr = next_pidmap(pid_ns, nr); - } - read_unlock(&tasklist_lock); - - do { - clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); - } while (rc != -ECHILD); - - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; - return; -} - /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -693,9 +525,6 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.pid_cachep = create_pid_cachep(1); - if (init_pid_ns.pid_cachep == NULL) - panic("Can't create pid_1 cachep\n"); - - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + init_pid_ns.pid_cachep = KMEM_CACHE(pid, + SLAB_HWCACHE_ALIGN | SLAB_PANIC); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c new file mode 100644 index 0000000..6d792b6 --- /dev/null +++ b/kernel/pid_namespace.c @@ -0,0 +1,197 @@ +/* + * Pid namespaces + * + * Authors: + * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. + * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM + * Many thanks to Oleg Nesterov for comments and help + * + */ + +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/syscalls.h> +#include <linux/err.h> + +#define BITS_PER_PAGE (PAGE_SIZE*8) + +struct pid_cache { + int nr_ids; + char name[16]; + struct kmem_cache *cachep; + struct list_head list; +}; + +static LIST_HEAD(pid_caches_lh); +static DEFINE_MUTEX(pid_caches_mutex); +static struct kmem_cache *pid_ns_cachep; + +/* + * creates the kmem cache to allocate pids from. + * @nr_ids: the number of numerical ids this pid will have to carry + */ + +static struct kmem_cache *create_pid_cachep(int nr_ids) +{ + struct pid_cache *pcache; + struct kmem_cache *cachep; + + mutex_lock(&pid_caches_mutex); + list_for_each_entry(pcache, &pid_caches_lh, list) + if (pcache->nr_ids == nr_ids) + goto out; + + pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); + if (pcache == NULL) + goto err_alloc; + + snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); + cachep = kmem_cache_create(pcache->name, + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (cachep == NULL) + goto err_cachep; + + pcache->nr_ids = nr_ids; + pcache->cachep = cachep; + list_add(&pcache->list, &pid_caches_lh); +out: + mutex_unlock(&pid_caches_mutex); + return pcache->cachep; + +err_cachep: + kfree(pcache); +err_alloc: + mutex_unlock(&pid_caches_mutex); + return NULL; +} + +static struct pid_namespace *create_pid_namespace(int level) +{ + struct pid_namespace *ns; + int i; + + ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); + if (ns == NULL) + goto out; + + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!ns->pidmap[0].page) + goto out_free; + + ns->pid_cachep = create_pid_cachep(level + 1); + if (ns->pid_cachep == NULL) + goto out_free_map; + + kref_init(&ns->kref); + ns->last_pid = 0; + ns->child_reaper = NULL; + ns->level = level; + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + + for (i = 1; i < PIDMAP_ENTRIES; i++) { + ns->pidmap[i].page = 0; + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + } + + return ns; + +out_free_map: + kfree(ns->pidmap[0].page); +out_free: + kmem_cache_free(pid_ns_cachep, ns); +out: + return ERR_PTR(-ENOMEM); +} + +static void destroy_pid_namespace(struct pid_namespace *ns) +{ + int i; + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); + kmem_cache_free(pid_ns_cachep, ns); +} + +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +{ + struct pid_namespace *new_ns; + + BUG_ON(!old_ns); + new_ns = get_pid_ns(old_ns); + if (!(flags & CLONE_NEWPID)) + goto out; + + new_ns = ERR_PTR(-EINVAL); + if (flags & CLONE_THREAD) + goto out_put; + + new_ns = create_pid_namespace(old_ns->level + 1); + if (!IS_ERR(new_ns)) + new_ns->parent = get_pid_ns(old_ns); + +out_put: + put_pid_ns(old_ns); +out: + return new_ns; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns, *parent; + + ns = container_of(kref, struct pid_namespace, kref); + + parent = ns->parent; + destroy_pid_namespace(ns); + + if (parent != NULL) + put_pid_ns(parent); +} + +void zap_pid_ns_processes(struct pid_namespace *pid_ns) +{ + int nr; + int rc; + + /* + * The last thread in the cgroup-init thread group is terminating. + * Find remaining pid_ts in the namespace, signal and wait for them + * to exit. + * + * Note: This signals each threads in the namespace - even those that + * belong to the same thread group, To avoid this, we would have + * to walk the entire tasklist looking a processes in this + * namespace, but that could be unnecessarily expensive if the + * pid namespace has just a few processes. Or we need to + * maintain a tasklist for each pid namespace. + * + */ + read_lock(&tasklist_lock); + nr = next_pidmap(pid_ns, 1); + while (nr > 0) { + kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); + nr = next_pidmap(pid_ns, nr); + } + read_unlock(&tasklist_lock); + + do { + clear_thread_flag(TIF_SIGPENDING); + rc = sys_wait4(-1, NULL, __WALL, NULL); + } while (rc != -ECHILD); + + + /* Child reaper for the pid namespace is going away */ + pid_ns->child_reaper = NULL; + return; +} + +static __init int pid_namespaces_init(void) +{ + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + return 0; +} + +__initcall(pid_namespaces_init); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0b7c82ac..2eae91f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -20,7 +20,7 @@ static int check_clock(const clockid_t which_clock) return 0; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? same_thread_group(p, current) : thread_group_leader(p))) { error = -EINVAL; @@ -305,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) */ struct task_struct *p; rcu_read_lock(); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { if (same_thread_group(p, current)) { @@ -354,7 +354,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) if (pid == 0) { p = current; } else { - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p && !same_thread_group(p, current)) p = NULL; } @@ -362,7 +362,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) if (pid == 0) { p = current->group_leader; } else { - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p && !thread_group_leader(p)) p = NULL; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 122d5c7..a9b0420 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event) struct task_struct *rtn = current->group_leader; if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || !same_thread_group(rtn, current) || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) return NULL; @@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) - timer->expires = ktime_add(timer->expires, - timer->base->get_time()); + if (mode == HRTIMER_MODE_REL) { + timer->expires = + ktime_add_safe(timer->expires, + timer->base->get_time()); + } return 0; } @@ -982,20 +984,9 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { - struct timespec rmt; - int ret; - - ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL, - flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ef9b802..7983317 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -74,8 +74,8 @@ config PM_TRACE_RTC RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). - To use this debugging feature you should attempt to suspend the machine, - then reboot it, then run + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run dmesg -s 1000000 | grep 'hash matches' @@ -123,7 +123,10 @@ config HIBERNATION called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. - You can suspend your machine with 'echo disk > /sys/power/state'. + You can suspend your machine with 'echo disk > /sys/power/state' + after placing resume=/dev/swappartition on the kernel command line + in your bootloader's configuration file. + Alternatively, you can use the additional userland tools available from <http://suspend.sf.net>. diff --git a/kernel/printk.c b/kernel/printk.c index 29ae1e99..bee3610 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,7 +32,6 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/syscalls.h> -#include <linux/jiffies.h> #include <asm/uaccess.h> @@ -93,16 +92,16 @@ static int console_locked, console_suspended; */ static DEFINE_SPINLOCK(logbuf_lock); -#define LOG_BUF_MASK (log_buf_len-1) +#define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) /* * The indices into log_buf are not constrained to log_buf_len - they * must be masked before subscripting */ -static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ +static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ +static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ +static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ /* * Array of consoles built from command line options (console=) @@ -128,17 +127,17 @@ static int console_may_schedule; static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; -static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ +static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ static int __init log_buf_len_setup(char *str) { - unsigned long size = memparse(str, &str); + unsigned size = memparse(str, &str); unsigned long flags; if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) { - unsigned long start, dest_idx, offset; + unsigned start, dest_idx, offset; char *new_log_buf; new_log_buf = alloc_bootmem(size); @@ -295,7 +294,7 @@ int log_buf_read(int idx) */ int do_syslog(int type, char __user *buf, int len) { - unsigned long i, j, limit, count; + unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; @@ -436,7 +435,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len) /* * Call the console drivers on a range of log_buf */ -static void __call_console_drivers(unsigned long start, unsigned long end) +static void __call_console_drivers(unsigned start, unsigned end) { struct console *con; @@ -463,8 +462,8 @@ early_param("ignore_loglevel", ignore_loglevel_setup); /* * Write out chars from start to end - 1 inclusive */ -static void _call_console_drivers(unsigned long start, - unsigned long end, int msg_log_level) +static void _call_console_drivers(unsigned start, + unsigned end, int msg_log_level) { if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { @@ -484,12 +483,12 @@ static void _call_console_drivers(unsigned long start, * log_buf[start] to log_buf[end - 1]. * The console_sem must be held. */ -static void call_console_drivers(unsigned long start, unsigned long end) +static void call_console_drivers(unsigned start, unsigned end) { - unsigned long cur_index, start_print; + unsigned cur_index, start_print; static int msg_level = -1; - BUG_ON(((long)(start - end)) > 0); + BUG_ON(((int)(start - end)) > 0); cur_index = start; start_print = start; @@ -567,19 +566,6 @@ static int printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); -static int __init printk_time_setup(char *str) -{ - if (*str) - return 0; - printk_time = 1; - printk(KERN_NOTICE "The 'time' option is deprecated and " - "is scheduled for removal in early 2008\n"); - printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n"); - return 1; -} - -__setup("time", printk_time_setup); - /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -790,7 +776,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len) return -ENOSYS; } -static void call_console_drivers(unsigned long start, unsigned long end) +static void call_console_drivers(unsigned start, unsigned end) { } @@ -983,8 +969,8 @@ void wake_up_klogd(void) void release_console_sem(void) { unsigned long flags; - unsigned long _con_start, _log_end; - unsigned long wake_klogd = 0; + unsigned _con_start, _log_end; + unsigned wake_klogd = 0; if (console_suspended) { up(&secondary_console_sem); @@ -1265,6 +1251,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) return; } +#if defined CONFIG_PRINTK /* * printk rate limiting, lifted from the networking subsystem. * @@ -1275,7 +1262,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10 * 5 * HZ; + static unsigned toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1334,3 +1321,4 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } EXPORT_SYMBOL(printk_timed_ratelimit); +#endif diff --git a/kernel/profile.c b/kernel/profile.c index e64c2da..3b7a1b0 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -20,7 +20,6 @@ #include <linux/mm.h> #include <linux/cpumask.h> #include <linux/cpu.h> -#include <linux/profile.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <asm/sections.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b0d4ab4..fdb34e8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/signal.h> #include <linux/audit.h> #include <linux/pid_namespace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -53,7 +54,7 @@ void ptrace_untrace(struct task_struct *child) spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { if (child->signal->flags & SIGNAL_STOP_STOPPED) { - child->state = TASK_STOPPED; + __set_task_state(child, TASK_STOPPED); } else { signal_wake_up(child, 1); } @@ -98,23 +99,23 @@ int ptrace_check_attach(struct task_struct *child, int kill) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current && - (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) - && child->signal != NULL) { + if ((child->ptrace & PT_PTRACED) && child->parent == current) { ret = 0; + /* + * child->sighand can't be NULL, release_task() + * does ptrace_unlink() before __exit_signal(). + */ spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) { + if (task_is_stopped(child)) child->state = TASK_TRACED; - } else if (!task_is_traced(child) && !kill) { + else if (!task_is_traced(child) && !kill) ret = -ESRCH; - } spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) { + if (!ret && !kill) wait_task_inactive(child); - } /* All systems go.. */ return ret; @@ -201,8 +202,7 @@ repeat: goto bad; /* Go */ - task->ptrace |= PT_PTRACED | ((task->real_parent != current) - ? PT_ATTACHED : 0); + task->ptrace |= PT_PTRACED; if (capable(CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 760dfc2..c09605f 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -/* Because of FASTCALL declaration of complete, we use this wrapper */ +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; diff --git a/kernel/relay.c b/kernel/relay.c index 7c03733..d080b9d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -37,37 +37,31 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) } /* - * nopage() vm_op implementation for relay file mapping. + * fault() vm_op implementation for relay file mapping. */ -static struct page *relay_buf_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page; struct rchan_buf *buf = vma->vm_private_data; - unsigned long offset = address - vma->vm_start; + pgoff_t pgoff = vmf->pgoff; - if (address > vma->vm_end) - return NOPAGE_SIGBUS; /* Disallow mremap */ if (!buf) - return NOPAGE_OOM; + return VM_FAULT_OOM; - page = vmalloc_to_page(buf->start + offset); + page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT)); if (!page) - return NOPAGE_OOM; + return VM_FAULT_SIGBUS; get_page(page); + vmf->page = page; - if (type) - *type = VM_FAULT_MINOR; - - return page; + return 0; } /* * vm_ops for relay file mappings. */ static struct vm_operations_struct relay_file_mmap_ops = { - .nopage = relay_buf_nopage, + .fault = relay_buf_fault, .close = relay_file_mmap_close, }; diff --git a/kernel/res_counter.c b/kernel/res_counter.c new file mode 100644 index 0000000..16cbec2 --- /dev/null +++ b/kernel/res_counter.c @@ -0,0 +1,134 @@ +/* + * resource cgroups + * + * Copyright 2007 OpenVZ SWsoft Inc + * + * Author: Pavel Emelianov <xemul@openvz.org> + * + */ + +#include <linux/types.h> +#include <linux/parser.h> +#include <linux/fs.h> +#include <linux/res_counter.h> +#include <linux/uaccess.h> + +void res_counter_init(struct res_counter *counter) +{ + spin_lock_init(&counter->lock); + counter->limit = (unsigned long long)LLONG_MAX; +} + +int res_counter_charge_locked(struct res_counter *counter, unsigned long val) +{ + if (counter->usage + val > counter->limit) { + counter->failcnt++; + return -ENOMEM; + } + + counter->usage += val; + return 0; +} + +int res_counter_charge(struct res_counter *counter, unsigned long val) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&counter->lock, flags); + ret = res_counter_charge_locked(counter, val); + spin_unlock_irqrestore(&counter->lock, flags); + return ret; +} + +void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; +} + +void res_counter_uncharge(struct res_counter *counter, unsigned long val) +{ + unsigned long flags; + + spin_lock_irqsave(&counter->lock, flags); + res_counter_uncharge_locked(counter, val); + spin_unlock_irqrestore(&counter->lock, flags); +} + + +static inline unsigned long long * +res_counter_member(struct res_counter *counter, int member) +{ + switch (member) { + case RES_USAGE: + return &counter->usage; + case RES_LIMIT: + return &counter->limit; + case RES_FAILCNT: + return &counter->failcnt; + }; + + BUG(); + return NULL; +} + +ssize_t res_counter_read(struct res_counter *counter, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos, + int (*read_strategy)(unsigned long long val, char *st_buf)) +{ + unsigned long long *val; + char buf[64], *s; + + s = buf; + val = res_counter_member(counter, member); + if (read_strategy) + s += read_strategy(*val, s); + else + s += sprintf(s, "%llu\n", *val); + return simple_read_from_buffer((void __user *)userbuf, nbytes, + pos, buf, s - buf); +} + +ssize_t res_counter_write(struct res_counter *counter, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos, + int (*write_strategy)(char *st_buf, unsigned long long *val)) +{ + int ret; + char *buf, *end; + unsigned long flags; + unsigned long long tmp, *val; + + buf = kmalloc(nbytes + 1, GFP_KERNEL); + ret = -ENOMEM; + if (buf == NULL) + goto out; + + buf[nbytes] = '\0'; + ret = -EFAULT; + if (copy_from_user(buf, userbuf, nbytes)) + goto out_free; + + ret = -EINVAL; + + if (write_strategy) { + if (write_strategy(buf, &tmp)) { + goto out_free; + } + } else { + tmp = simple_strtoull(buf, &end, 10); + if (*end != '\0') + goto out_free; + } + spin_lock_irqsave(&counter->lock, flags); + val = res_counter_member(counter, member); + *val = tmp; + spin_unlock_irqrestore(&counter->lock, flags); + ret = nbytes; +out_free: + kfree(buf); +out: + return ret; +} diff --git a/kernel/resource.c b/kernel/resource.c index 2eb553d..82aea81 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -228,7 +228,7 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) * the caller must specify res->start, res->end, res->flags. diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 56d73cb..5fcb4fe 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -130,7 +130,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, task = rt_mutex_owner(act_waiter->lock); if (task && task != current) { - act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_task_pid = get_pid(task_pid(task)); act_waiter->deadlock_lock = lock; } } @@ -142,9 +142,12 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) if (!waiter->deadlock_lock || !rt_trace_on) return; - task = find_task_by_pid(waiter->deadlock_task_pid); - if (!task) + rcu_read_lock(); + task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); return; + } TRACE_OFF_NOLOCK(); @@ -173,6 +176,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) current->comm, task_pid_nr(current)); dump_stack(); debug_show_all_locks(); + rcu_read_unlock(); printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); @@ -203,10 +207,12 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x11, sizeof(*waiter)); plist_node_init(&waiter->list_entry, MAX_PRIO); plist_node_init(&waiter->pi_list_entry, MAX_PRIO); + waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { + put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); TRACE_WARN_ON(waiter->task); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0deef71..6522ae5 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) + if (unlikely(timeout)) { hrtimer_start(&timeout->timer, timeout->timer.expires, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 2d3b835..e124bf5 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -51,7 +51,7 @@ struct rt_mutex_waiter { struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; - pid_t deadlock_task_pid; + struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 9474b23..f28f19e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -155,7 +155,7 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -165,19 +165,16 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - unsigned int rt_ratio; - /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. The more shares a group has, the more is @@ -213,33 +210,46 @@ struct task_group { * */ unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + u64 rt_runtime; +#endif struct rcu_head rcu; struct list_head list; }; +#ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif -/* task_group_mutex serializes add/remove of task groups and also changes to +/* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ -static DEFINE_MUTEX(task_group_mutex); +static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); +#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* kernel thread that runs rebalance_shares() periodically */ static struct task_struct *lb_monitor_task; @@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused); static void set_se_shares(struct sched_entity *se, unsigned long shares); +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif + +#define MIN_GROUP_SHARES 2 + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + /* Default task group. * Every task in system belong to this group at bootup. */ struct task_group init_task_group = { +#ifdef CONFIG_FAIR_GROUP_SCHED .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, +#endif +#ifdef CONFIG_RT_GROUP_SCHED .rt_se = init_sched_rt_entity_p, .rt_rq = init_rt_rq_p, -}; - -#ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif - -#define MIN_GROUP_SHARES 2 - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +}; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED tg = p->user->tg; -#elif defined(CONFIG_FAIR_CGROUP_SCHED) +#elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { +#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; +#endif +#ifdef CONFIG_RT_GROUP_SCHED p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; -} - -static inline void lock_task_group_list(void) -{ - mutex_lock(&task_group_mutex); -} - -static inline void unlock_task_group_list(void) -{ - mutex_unlock(&task_group_mutex); +#endif } static inline void lock_doms_cur(void) @@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_task_group_list(void) { } -static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -363,7 +370,7 @@ struct cfs_rq { struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED int highest_prio; /* highest queued rt task prio */ #endif #ifdef CONFIG_SMP @@ -373,7 +380,9 @@ struct rt_rq { int rt_throttled; u64 rt_time; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; @@ -447,6 +456,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. + * period over which we measure -rt task cpu usage in us. * default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +unsigned int sysctl_sched_rt_period = 1000000; -#define SCHED_RT_FRAC_SHIFT 16 -#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; /* - * ratio of time -rt tasks may consume. - * default: 95% + * single value that denotes runtime == period, ie unlimited time. */ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +#define RUNTIME_INF ((u64)~0ULL) /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -1893,13 +1906,13 @@ out: return success; } -int fastcall wake_up_process(struct task_struct *p) +int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); } EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(struct task_struct *p, unsigned int state) +int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); } @@ -1986,7 +1999,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; @@ -3753,7 +3766,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void fastcall add_preempt_count(int val) +void add_preempt_count(int val) { /* * Underflow? @@ -3769,7 +3782,7 @@ void fastcall add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +void sub_preempt_count(int val) { /* * Underflow? @@ -4067,7 +4080,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; @@ -4081,7 +4094,7 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } @@ -4099,7 +4112,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; @@ -4571,6 +4584,15 @@ recheck: return -EPERM; } +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + return -EPERM; +#endif + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP @@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; #endif } @@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); se->parent = NULL; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, int add) @@ -7175,7 +7200,7 @@ void __init sched_init(void) init_defrootdomain(); #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7196,7 +7221,10 @@ void __init sched_init(void) &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); - init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), @@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irq(&tasklist_lock); + read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { /* * Only normalize user tasks: @@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void) continue; } - spin_lock_irqsave(&p->pi_lock, flags); + spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + spin_unlock(&p->pi_lock); } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + read_unlock_irqrestore(&tasklist_lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED -#ifdef CONFIG_SMP +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, * to reflect load distribution across cpus. @@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ -static void free_sched_group(struct task_group *tg) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) { int i; @@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg) kfree(tg->cfs_rq[i]); if (tg->se) kfree(tg->se[i]); - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); } kfree(tg->cfs_rq); kfree(tg->se); - kfree(tg->rt_rq); - kfree(tg->rt_se); - kfree(tg); } -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +static int alloc_fair_sched_group(struct task_group *tg) { - struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; struct rq *rq; int i; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_se) - goto err; tg->shares = NICE_0_LOAD; - tg->rt_ratio = 0; /* XXX */ for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void) if (!se) goto err; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + } + + return 1; + + err: + return 0; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_fair_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static int alloc_rt_sched_group(struct task_group *tg) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->rt_runtime = 0; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + rt_rq = kmalloc_node(sizeof(struct rt_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!rt_rq) @@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void) if (!rt_se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - lock_task_group_list(); + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_rt_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg)) + goto err; + + if (!alloc_rt_sched_group(tg)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); - unlock_task_group_list(); + spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { - struct cfs_rq *cfs_rq = NULL; - struct rt_rq *rt_rq = NULL; + unsigned long flags; int i; - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_del_rcu(&rt_rq->leaf_rt_rq_list); + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); - unlock_task_group_list(); - - BUG_ON(!cfs_rq); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ call_rcu(&tg->rcu, free_sched_group_rcu); @@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#ifdef CONFIG_FAIR_GROUP_SCHED /* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { @@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) } } +static DEFINE_MUTEX(shares_mutex); + int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - struct cfs_rq *cfs_rq; - struct rq *rq; + unsigned long flags; - lock_task_group_list(); + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; @@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * load_balance_fair) from referring to this group first, * by taking it off the rq->leaf_cfs_rq_list on each cpu. */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ synchronize_sched(); @@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * Enable load balance activity on this group, by inserting it back on * each cpu's rq->leaf_cfs_rq_list. */ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); done: - unlock_task_group_list(); + mutex_unlock(&shares_mutex); return 0; } @@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg) { return tg->shares; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED /* - * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + * Ensure that the real time constraints are schedulable. */ -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 16; + + runtime *= (1ULL << 16); + div64_64(runtime, period); + return runtime; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, + sysctl_sched_rt_runtime < 0 ? + RUNTIME_INF : sysctl_sched_rt_runtime); rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) - total += tgi->rt_ratio; - rcu_read_unlock(); + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; - if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) - return -EINVAL; + total += to_ratio(period, tgi->rt_runtime); + } + rcu_read_unlock(); - tg->rt_ratio = rt_ratio; - return 0; + return total + to_ratio(period, runtime) < global_ratio; } -unsigned long sched_group_rt_ratio(struct task_group *tg) +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { - return tg->rt_ratio; + u64 rt_runtime, rt_period; + int err = 0; + + rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us == -1) + rt_runtime = rt_period; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } + if (rt_runtime_us == -1) + rt_runtime = RUNTIME_INF; + tg->rt_runtime = rt_runtime; + unlock: + mutex_unlock(&rt_constraints_mutex); + + return err; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} +#endif +#endif /* CONFIG_GROUP_SCHED */ -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) @@ -7857,9 +8037,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#ifdef CONFIG_RT_GROUP_SCHED + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + return -EINVAL; +#else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) return -EINVAL; +#endif return 0; } @@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, sched_move_task(tsk); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { @@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +#endif -static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_ratio_val) +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { - return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); + char buffer[64]; + int retval = 0; + s64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + + /* Pass to subsystem */ + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + if (!retval) + retval = nbytes; + return retval; } -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct task_group *tg = cgroup_tg(cgrp); + char tmp[64]; + long val = sched_group_rt_runtime(cgroup_tg(cgrp)); + int len = sprintf(tmp, "%ld\n", val); - return (u64) tg->rt_ratio; + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +#endif static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED { - .name = "rt_ratio", - .read_uint = cpu_rt_ratio_read_uint, - .write_uint = cpu_rt_ratio_write_uint, + .name = "rt_runtime_us", + .read = cpu_rt_runtime_read, + .write = cpu_rt_runtime_write, }, +#endif }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .early_init = 1, }; -#endif /* CONFIG_FAIR_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_CGROUP_CPUACCT diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 274b40d..f54792b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return !list_empty(&rt_se->run_list); } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { if (!rt_rq->tg) - return SCHED_RT_FRAC; + return RUNTIME_INF; - return rt_rq->tg->rt_ratio; + return rt_rq->tg->rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); -static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) } } -static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} + #else -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return sysctl_sched_rt_ratio; + if (sysctl_sched_rt_runtime == -1) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return NULL; } -static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { } -static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) @@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } -static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { - unsigned int rt_ratio = sched_rt_ratio(rt_rq); - u64 period, ratio; + u64 runtime = sched_rt_runtime(rt_rq); - if (rt_ratio == SCHED_RT_FRAC) + if (runtime == RUNTIME_INF) return 0; if (rt_rq->rt_throttled) - return 1; - - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + return rt_rq_throttled(rt_rq); - if (rt_rq->rt_time > ratio) { + if (rt_rq->rt_time > runtime) { struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - sched_rt_ratio_dequeue(rt_rq); - return 1; + if (rt_rq_throttled(rt_rq)) { + sched_rt_rq_dequeue(rt_rq); + return 1; + } } return 0; @@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq) u64 period; while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rq->rt_period_expire += period; for_each_leaf_rt_rq(rt_rq, rq) { - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + u64 runtime = sched_rt_runtime(rt_rq); - rt_rq->rt_time -= min(rt_rq->rt_time, ratio); - if (rt_rq->rt_throttled) { + rt_rq->rt_time -= min(rt_rq->rt_time, runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); + sched_rt_rq_enqueue(rt_rq); } } @@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq) cpuacct_charge(curr, delta_exec); rt_rq->rt_time += delta_exec; - /* - * might make it a tad more accurate: - * - * update_sched_rt_period(rq); - */ - if (sched_rt_ratio_exceeded(rt_rq)) + if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); } @@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) rt_rq->highest_prio = rt_se_prio(rt_se); #endif @@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; +#endif } static inline @@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { struct rt_prio_array *array; @@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +#endif } static void enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && group_rq->rt_throttled) + if (group_rq && rt_rq_throttled(group_rq)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (unlikely(!rt_rq->rt_nr_running)) return NULL; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) return NULL; do { diff --git a/kernel/signal.c b/kernel/signal.c index 6a5f97c..84917fe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p) } } -int fastcall __fatal_signal_pending(struct task_struct *tsk) +int __fatal_signal_pending(struct task_struct *tsk) { return sigismember(&tsk->pending.signal, SIGKILL); } @@ -1018,7 +1018,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pgrp_info() sends a signal to a process group: this is what the tty + * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ @@ -1037,30 +1037,28 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) return success ? 0 : retval; } -int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) -{ - int retval; - - read_lock(&tasklist_lock); - retval = __kill_pgrp_info(sig, info, pgrp); - read_unlock(&tasklist_lock); - - return retval; -} - int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { - int error; + int error = -ESRCH; struct task_struct *p; rcu_read_lock(); if (unlikely(sig_needs_tasklist(sig))) read_lock(&tasklist_lock); +retry: p = pid_task(pid, PIDTYPE_PID); - error = -ESRCH; - if (p) + if (p) { error = group_send_sig_info(sig, info, p); + if (unlikely(error == -ESRCH)) + /* + * The task was unhashed in between, try again. + * If it is dead, pid_task() will return NULL, + * if we race with de_thread() it will find the + * new leader. + */ + goto retry; + } if (unlikely(sig_needs_tasklist(sig))) read_unlock(&tasklist_lock); @@ -1125,14 +1123,22 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); static int kill_something_info(int sig, struct siginfo *info, int pid) { int ret; - rcu_read_lock(); - if (!pid) { - ret = kill_pgrp_info(sig, info, task_pgrp(current)); - } else if (pid == -1) { + + if (pid > 0) { + rcu_read_lock(); + ret = kill_pid_info(sig, info, find_vpid(pid)); + rcu_read_unlock(); + return ret; + } + + read_lock(&tasklist_lock); + if (pid != -1) { + ret = __kill_pgrp_info(sig, info, + pid ? find_vpid(-pid) : task_pgrp(current)); + } else { int retval = 0, count = 0; struct task_struct * p; - read_lock(&tasklist_lock); for_each_process(p) { if (p->pid > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); @@ -1141,14 +1147,10 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) retval = err; } } - read_unlock(&tasklist_lock); ret = count ? retval : -ESRCH; - } else if (pid < 0) { - ret = kill_pgrp_info(sig, info, find_vpid(-pid)); - } else { - ret = kill_pid_info(sig, info, find_vpid(pid)); } - rcu_read_unlock(); + read_unlock(&tasklist_lock); + return ret; } @@ -1196,20 +1198,6 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -/* - * This is the entry point for "process-wide" signals. - * They will go to an appropriate thread in the thread group. - */ -int -send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - int ret; - read_lock(&tasklist_lock); - ret = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); - return ret; -} - void force_sig(int sig, struct task_struct *p) { @@ -1237,7 +1225,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pgrp(struct pid *pid, int sig, int priv) { - return kill_pgrp_info(sig, __si_special(priv), pid); + int ret; + + read_lock(&tasklist_lock); + ret = __kill_pgrp_info(sig, __si_special(priv), pid); + read_unlock(&tasklist_lock); + + return ret; } EXPORT_SYMBOL(kill_pgrp); @@ -1556,11 +1550,6 @@ static inline int may_ptrace_stop(void) { if (!likely(current->ptrace & PT_PTRACED)) return 0; - - if (unlikely(current->parent == current->real_parent && - (current->ptrace & PT_ATTACHED))) - return 0; - /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1578,6 +1567,17 @@ static inline int may_ptrace_stop(void) } /* + * Return nonzero if there is a SIGKILL that should be waking us up. + * Called with the siglock held. + */ +static int sigkill_pending(struct task_struct *tsk) +{ + return ((sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && + !unlikely(sigismember(&tsk->blocked, SIGKILL))); +} + +/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1585,11 +1585,31 @@ static inline int may_ptrace_stop(void) * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * - * If we actually decide not to stop at all because the tracer is gone, - * we leave nostop_code in current->exit_code. + * If we actually decide not to stop at all because the tracer + * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) { + int killed = 0; + + if (arch_ptrace_stop_needed(exit_code, info)) { + /* + * The arch code has something special to do before a + * ptrace stop. This is allowed to block, e.g. for faults + * on user stack pages. We can't keep the siglock while + * calling arch_ptrace_stop, so we must release it now. + * To preserve proper semantics, we must do this before + * any signal bookkeeping like checking group_stop_count. + * Meanwhile, a SIGKILL could come in before we retake the + * siglock. That must prevent us from sleeping in TASK_TRACED. + * So after regaining the lock, we must check for SIGKILL. + */ + spin_unlock_irq(¤t->sighand->siglock); + arch_ptrace_stop(exit_code, info); + spin_lock_irq(¤t->sighand->siglock); + killed = sigkill_pending(current); + } + /* * If there is a group stop in progress, * we must participate in the bookkeeping. @@ -1601,22 +1621,23 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) current->exit_code = exit_code; /* Let the debugger run. */ - set_current_state(TASK_TRACED); + __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (may_ptrace_stop()) { + if (!unlikely(killed) && may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { /* * By the time we got the lock, our tracer went away. - * Don't stop here. + * Don't drop the lock yet, another tracer may come. */ + __set_current_state(TASK_RUNNING); + if (clear_code) + current->exit_code = 0; read_unlock(&tasklist_lock); - set_current_state(TASK_RUNNING); - current->exit_code = nostop_code; } /* @@ -1649,7 +1670,7 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, 0, &info); + ptrace_stop(exit_code, 1, &info); spin_unlock_irq(¤t->sighand->siglock); } @@ -1712,7 +1733,7 @@ static int do_signal_stop(int signr) * stop is always done with the siglock held, * so this check has no races. */ - if (!t->exit_state && + if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { stop_count++; signal_wake_up(t, 0); @@ -1756,7 +1777,7 @@ relock: ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, signr, info); + ptrace_stop(signr, 0, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; @@ -1873,6 +1894,48 @@ relock: return signr; } +void exit_signals(struct task_struct *tsk) +{ + int group_stop = 0; + struct task_struct *t; + + if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { + tsk->flags |= PF_EXITING; + return; + } + + spin_lock_irq(&tsk->sighand->siglock); + /* + * From now this task is not visible for group-wide signals, + * see wants_signal(), do_signal_stop(). + */ + tsk->flags |= PF_EXITING; + if (!signal_pending(tsk)) + goto out; + + /* It could be that __group_complete_signal() choose us to + * notify about group-wide signal. Another thread should be + * woken now to take the signal since we will not. + */ + for (t = tsk; (t = next_thread(t)) != tsk; ) + if (!signal_pending(t) && !(t->flags & PF_EXITING)) + recalc_sigpending_and_wake(t); + + if (unlikely(tsk->signal->group_stop_count) && + !--tsk->signal->group_stop_count) { + tsk->signal->flags = SIGNAL_STOP_STOPPED; + group_stop = 1; + } +out: + spin_unlock_irq(&tsk->sighand->siglock); + + if (unlikely(group_stop)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(tsk, CLD_STOPPED); + read_unlock(&tasklist_lock); + } +} + EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); diff --git a/kernel/softirq.c b/kernel/softirq.c index d7837d4..5b3aea5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -320,7 +320,7 @@ void irq_exit(void) /* * This function must run with irqs disabled! */ -inline fastcall void raise_softirq_irqoff(unsigned int nr) +inline void raise_softirq_irqoff(unsigned int nr) { __raise_softirq_irqoff(nr); @@ -337,7 +337,7 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -void fastcall raise_softirq(unsigned int nr) +void raise_softirq(unsigned int nr) { unsigned long flags; @@ -363,7 +363,7 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; -void fastcall __tasklet_schedule(struct tasklet_struct *t) +void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; @@ -376,7 +376,7 @@ void fastcall __tasklet_schedule(struct tasklet_struct *t) EXPORT_SYMBOL(__tasklet_schedule); -void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) +void __tasklet_hi_schedule(struct tasklet_struct *t) { unsigned long flags; diff --git a/kernel/srcu.c b/kernel/srcu.c index 3507cabe..b0aeeaf 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -74,7 +74,7 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -int srcu_readers_active(struct srcu_struct *sp) +static int srcu_readers_active(struct srcu_struct *sp) { return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); } @@ -255,4 +255,3 @@ EXPORT_SYMBOL_GPL(srcu_read_lock); EXPORT_SYMBOL_GPL(srcu_read_unlock); EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); -EXPORT_SYMBOL_GPL(srcu_readers_active); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 51b5ee5..6f4e0e1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -29,7 +29,6 @@ enum stopmachine_state { static enum stopmachine_state stopmachine_state; static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; -static DECLARE_MUTEX(stopmachine_mutex); static int stopmachine(void *cpu) { @@ -170,6 +169,7 @@ static int do_stop(void *_smdata) struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { + static DEFINE_MUTEX(stopmachine_mutex); struct stop_machine_data smdata; struct task_struct *p; @@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, smdata.data = data; init_completion(&smdata.done); - down(&stopmachine_mutex); + mutex_lock(&stopmachine_mutex); /* If they don't care which CPU fn runs on, bind to any online one. */ if (cpu == NR_CPUS) @@ -193,7 +193,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, wake_up_process(p); wait_for_completion(&smdata.done); } - up(&stopmachine_mutex); + mutex_unlock(&stopmachine_mutex); return p; } diff --git a/kernel/sys.c b/kernel/sys.c index 53de35f..a626116 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -916,8 +916,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; - int err = -EINVAL; - struct pid_namespace *ns; + struct pid *pgrp; + int err; if (!pid) pid = task_pid_vnr(group_leader); @@ -929,12 +929,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ - ns = current->nsproxy->pid_ns; - write_lock_irq(&tasklist_lock); err = -ESRCH; - p = find_task_by_pid_ns(pid, ns); + p = find_task_by_vpid(pid); if (!p) goto out; @@ -942,7 +940,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent->tgid == group_leader->tgid) { + if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; @@ -959,10 +957,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->signal->leader) goto out; + pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; - g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns); + pgrp = find_vpid(pgid); + g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } @@ -971,13 +971,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (err) goto out; - if (task_pgrp_nr_ns(p, ns) != pgid) { - struct pid *pid; - + if (task_pgrp(p) != pgrp) { detach_pid(p, PIDTYPE_PGID); - pid = find_vpid(pgid); - attach_pid(p, PIDTYPE_PGID, pid); - set_task_pgrp(p, pid_nr(pid)); + attach_pid(p, PIDTYPE_PGID, pgrp); + set_task_pgrp(p, pid_nr(pgrp)); } err = 0; @@ -994,17 +991,14 @@ asmlinkage long sys_getpgid(pid_t pid) else { int retval; struct task_struct *p; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; read_lock(&tasklist_lock); - p = find_task_by_pid_ns(pid, ns); + p = find_task_by_vpid(pid); retval = -ESRCH; if (p) { retval = security_task_getpgid(p); if (!retval) - retval = task_pgrp_nr_ns(p, ns); + retval = task_pgrp_vnr(p); } read_unlock(&tasklist_lock); return retval; @@ -1028,19 +1022,16 @@ asmlinkage long sys_getsid(pid_t pid) else { int retval; struct task_struct *p; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; - read_lock(&tasklist_lock); - p = find_task_by_pid_ns(pid, ns); + rcu_read_lock(); + p = find_task_by_vpid(pid); retval = -ESRCH; if (p) { retval = security_task_getsid(p); if (!retval) - retval = task_session_nr_ns(p, ns); + retval = task_session_vnr(p); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } } @@ -1048,35 +1039,29 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { struct task_struct *group_leader = current->group_leader; - pid_t session; + struct pid *sid = task_pid(group_leader); + pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); - /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; - session = group_leader->pid; /* Fail if a process group id already exists that equals the * proposed session id. - * - * Don't check if session id == 1 because kernel threads use this - * session id and so the check will always fail and make it so - * init cannot successfully call setsid. */ - if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, - session, &init_pid_ns)) + if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; - __set_special_pids(session, session); + __set_special_pids(sid); spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; spin_unlock(&group_leader->sighand->siglock); - err = task_pgrp_vnr(group_leader); + err = session; out: write_unlock_irq(&tasklist_lock); return err; @@ -1145,16 +1130,16 @@ static int groups_to_user(gid_t __user *grouplist, struct group_info *group_info) { int i; - int count = group_info->ngroups; + unsigned int count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min(NGROUPS_PER_BLOCK, count); - int off = i * NGROUPS_PER_BLOCK; - int len = cp_count * sizeof(*grouplist); + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); - if (copy_to_user(grouplist+off, group_info->blocks[i], len)) + if (copy_to_user(grouplist, group_info->blocks[i], len)) return -EFAULT; + grouplist += NGROUPS_PER_BLOCK; count -= cp_count; } return 0; @@ -1165,16 +1150,16 @@ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { int i; - int count = group_info->ngroups; + unsigned int count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min(NGROUPS_PER_BLOCK, count); - int off = i * NGROUPS_PER_BLOCK; - int len = cp_count * sizeof(*grouplist); + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); - if (copy_from_user(group_info->blocks[i], grouplist+off, len)) + if (copy_from_user(group_info->blocks[i], grouplist, len)) return -EFAULT; + grouplist += NGROUPS_PER_BLOCK; count -= cp_count; } return 0; @@ -1472,7 +1457,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) return -EPERM; retval = security_task_setrlimit(resource, &new_rlim); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5e2ad5b..8b7e954 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,7 +37,6 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/hugetlb.h> -#include <linux/security.h> #include <linux/initrd.h> #include <linux/times.h> #include <linux/limits.h> @@ -67,14 +66,13 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int sysctl_oom_kill_allocating_task; +extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; extern int pid_max; extern int min_free_kbytes; -extern int printk_ratelimit_jiffies; -extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; @@ -313,22 +311,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_ms", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_ratio", - .data = &sysctl_sched_rt_ratio, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) { .ctl_name = CTL_UNNUMBERED, @@ -350,6 +332,22 @@ static struct ctl_table kern_table[] = { #endif { .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", .data = &sysctl_sched_compat_yield, .maxlen = sizeof(unsigned int), @@ -490,14 +488,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = KERN_PRINTK, - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, @@ -644,6 +634,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#if defined CONFIG_PRINTK + { + .ctl_name = KERN_PRINTK, + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", @@ -661,6 +660,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = KERN_NGROUPS_MAX, .procname = "ngroups_max", @@ -871,6 +871,14 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, @@ -970,10 +978,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &nr_overcommit_huge_pages, - .maxlen = sizeof(nr_overcommit_huge_pages), + .data = &sysctl_overcommit_huge_pages, + .maxlen = sizeof(sysctl_overcommit_huge_pages), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &hugetlb_overcommit_handler, }, #endif { @@ -1203,6 +1211,14 @@ static struct ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, @@ -2471,7 +2487,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp pid_t tmp; int r; - tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns); + tmp = pid_vnr(cad_pid); r = __do_proc_dointvec(&tmp, table, write, filp, buffer, lenp, ppos, NULL, NULL); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 006365b..c09350d 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -8,10 +8,10 @@ struct trans_ctl_table { int ctl_name; const char *procname; - struct trans_ctl_table *child; + const struct trans_ctl_table *child; }; -static struct trans_ctl_table trans_random_table[] = { +static const struct trans_ctl_table trans_random_table[] = { { RANDOM_POOLSIZE, "poolsize" }, { RANDOM_ENTROPY_COUNT, "entropy_avail" }, { RANDOM_READ_THRESH, "read_wakeup_threshold" }, @@ -21,13 +21,13 @@ static struct trans_ctl_table trans_random_table[] = { {} }; -static struct trans_ctl_table trans_pty_table[] = { +static const struct trans_ctl_table trans_pty_table[] = { { PTY_MAX, "max" }, { PTY_NR, "nr" }, {} }; -static struct trans_ctl_table trans_kern_table[] = { +static const struct trans_ctl_table trans_kern_table[] = { { KERN_OSTYPE, "ostype" }, { KERN_OSRELEASE, "osrelease" }, /* KERN_OSREV not used */ @@ -107,7 +107,7 @@ static struct trans_ctl_table trans_kern_table[] = { {} }; -static struct trans_ctl_table trans_vm_table[] = { +static const struct trans_ctl_table trans_vm_table[] = { { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, { VM_PAGE_CLUSTER, "page-cluster" }, { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, @@ -139,7 +139,7 @@ static struct trans_ctl_table trans_vm_table[] = { {} }; -static struct trans_ctl_table trans_net_core_table[] = { +static const struct trans_ctl_table trans_net_core_table[] = { { NET_CORE_WMEM_MAX, "wmem_max" }, { NET_CORE_RMEM_MAX, "rmem_max" }, { NET_CORE_WMEM_DEFAULT, "wmem_default" }, @@ -165,14 +165,14 @@ static struct trans_ctl_table trans_net_core_table[] = { {}, }; -static struct trans_ctl_table trans_net_unix_table[] = { +static const struct trans_ctl_table trans_net_unix_table[] = { /* NET_UNIX_DESTROY_DELAY unused */ /* NET_UNIX_DELETE_DELAY unused */ { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, {} }; -static struct trans_ctl_table trans_net_ipv4_route_table[] = { +static const struct trans_ctl_table trans_net_ipv4_route_table[] = { { NET_IPV4_ROUTE_FLUSH, "flush" }, { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, @@ -195,7 +195,7 @@ static struct trans_ctl_table trans_net_ipv4_route_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { +static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { { NET_IPV4_CONF_FORWARDING, "forwarding" }, { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, @@ -222,14 +222,14 @@ static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv4_conf_table[] = { +static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, { 0, NULL, trans_net_ipv4_conf_vars_table }, {} }; -static struct trans_ctl_table trans_net_neigh_vars_table[] = { +static const struct trans_ctl_table trans_net_neigh_vars_table[] = { { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, { NET_NEIGH_APP_SOLICIT, "app_solicit" }, @@ -251,13 +251,13 @@ static struct trans_ctl_table trans_net_neigh_vars_table[] = { {} }; -static struct trans_ctl_table trans_net_neigh_table[] = { +static const struct trans_ctl_table trans_net_neigh_table[] = { { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, { 0, NULL, trans_net_neigh_vars_table }, {} }; -static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { +static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, @@ -294,7 +294,7 @@ static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv4_table[] = { +static const struct trans_ctl_table trans_net_ipv4_table[] = { { NET_IPV4_FORWARD, "ip_forward" }, { NET_IPV4_DYNADDR, "ip_dynaddr" }, @@ -393,13 +393,13 @@ static struct trans_ctl_table trans_net_ipv4_table[] = { {} }; -static struct trans_ctl_table trans_net_ipx_table[] = { +static const struct trans_ctl_table trans_net_ipx_table[] = { { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, /* NET_IPX_FORWARDING unused */ {} }; -static struct trans_ctl_table trans_net_atalk_table[] = { +static const struct trans_ctl_table trans_net_atalk_table[] = { { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, @@ -407,7 +407,7 @@ static struct trans_ctl_table trans_net_atalk_table[] = { {}, }; -static struct trans_ctl_table trans_net_netrom_table[] = { +static const struct trans_ctl_table trans_net_netrom_table[] = { { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, @@ -423,7 +423,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = { {} }; -static struct trans_ctl_table trans_net_ax25_param_table[] = { +static const struct trans_ctl_table trans_net_ax25_param_table[] = { { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, { NET_AX25_BACKOFF_TYPE, "backoff_type" }, @@ -441,12 +441,12 @@ static struct trans_ctl_table trans_net_ax25_param_table[] = { {} }; -static struct trans_ctl_table trans_net_ax25_table[] = { +static const struct trans_ctl_table trans_net_ax25_table[] = { { 0, NULL, trans_net_ax25_param_table }, {} }; -static struct trans_ctl_table trans_net_bridge_table[] = { +static const struct trans_ctl_table trans_net_bridge_table[] = { { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, @@ -455,7 +455,7 @@ static struct trans_ctl_table trans_net_bridge_table[] = { {} }; -static struct trans_ctl_table trans_net_rose_table[] = { +static const struct trans_ctl_table trans_net_rose_table[] = { { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, @@ -469,7 +469,7 @@ static struct trans_ctl_table trans_net_rose_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { +static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { { NET_IPV6_FORWARDING, "forwarding" }, { NET_IPV6_HOP_LIMIT, "hop_limit" }, { NET_IPV6_MTU, "mtu" }, @@ -497,14 +497,14 @@ static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_conf_table[] = { +static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, { 0, NULL, trans_net_ipv6_conf_var_table }, {} }; -static struct trans_ctl_table trans_net_ipv6_route_table[] = { +static const struct trans_ctl_table trans_net_ipv6_route_table[] = { { NET_IPV6_ROUTE_FLUSH, "flush" }, { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, @@ -518,12 +518,12 @@ static struct trans_ctl_table trans_net_ipv6_route_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_icmp_table[] = { +static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, {} }; -static struct trans_ctl_table trans_net_ipv6_table[] = { +static const struct trans_ctl_table trans_net_ipv6_table[] = { { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, @@ -538,7 +538,7 @@ static struct trans_ctl_table trans_net_ipv6_table[] = { {} }; -static struct trans_ctl_table trans_net_x25_table[] = { +static const struct trans_ctl_table trans_net_x25_table[] = { { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, @@ -548,13 +548,13 @@ static struct trans_ctl_table trans_net_x25_table[] = { {} }; -static struct trans_ctl_table trans_net_tr_table[] = { +static const struct trans_ctl_table trans_net_tr_table[] = { { NET_TR_RIF_TIMEOUT, "rif_timeout" }, {} }; -static struct trans_ctl_table trans_net_decnet_conf_vars[] = { +static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, { NET_DECNET_CONF_DEV_T2, "t2" }, @@ -562,12 +562,12 @@ static struct trans_ctl_table trans_net_decnet_conf_vars[] = { {} }; -static struct trans_ctl_table trans_net_decnet_conf[] = { +static const struct trans_ctl_table trans_net_decnet_conf[] = { { 0, NULL, trans_net_decnet_conf_vars }, {} }; -static struct trans_ctl_table trans_net_decnet_table[] = { +static const struct trans_ctl_table trans_net_decnet_table[] = { { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, { NET_DECNET_NODE_ADDRESS, "node_address" }, { NET_DECNET_NODE_NAME, "node_name" }, @@ -585,7 +585,7 @@ static struct trans_ctl_table trans_net_decnet_table[] = { {} }; -static struct trans_ctl_table trans_net_sctp_table[] = { +static const struct trans_ctl_table trans_net_sctp_table[] = { { NET_SCTP_RTO_INITIAL, "rto_initial" }, { NET_SCTP_RTO_MIN, "rto_min" }, { NET_SCTP_RTO_MAX, "rto_max" }, @@ -606,7 +606,7 @@ static struct trans_ctl_table trans_net_sctp_table[] = { {} }; -static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { +static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { { NET_LLC2_ACK_TIMEOUT, "ack" }, { NET_LLC2_P_TIMEOUT, "p" }, { NET_LLC2_REJ_TIMEOUT, "rej" }, @@ -614,23 +614,23 @@ static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { {} }; -static struct trans_ctl_table trans_net_llc_station_table[] = { +static const struct trans_ctl_table trans_net_llc_station_table[] = { { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, {} }; -static struct trans_ctl_table trans_net_llc_llc2_table[] = { +static const struct trans_ctl_table trans_net_llc_llc2_table[] = { { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, {} }; -static struct trans_ctl_table trans_net_llc_table[] = { +static const struct trans_ctl_table trans_net_llc_table[] = { { NET_LLC2, "llc2", trans_net_llc_llc2_table }, { NET_LLC_STATION, "station", trans_net_llc_station_table }, {} }; -static struct trans_ctl_table trans_net_netfilter_table[] = { +static const struct trans_ctl_table trans_net_netfilter_table[] = { { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, @@ -667,12 +667,12 @@ static struct trans_ctl_table trans_net_netfilter_table[] = { {} }; -static struct trans_ctl_table trans_net_dccp_table[] = { +static const struct trans_ctl_table trans_net_dccp_table[] = { { NET_DCCP_DEFAULT, "default" }, {} }; -static struct trans_ctl_table trans_net_irda_table[] = { +static const struct trans_ctl_table trans_net_irda_table[] = { { NET_IRDA_DISCOVERY, "discovery" }, { NET_IRDA_DEVNAME, "devname" }, { NET_IRDA_DEBUG, "debug" }, @@ -690,7 +690,7 @@ static struct trans_ctl_table trans_net_irda_table[] = { {} }; -static struct trans_ctl_table trans_net_table[] = { +static const struct trans_ctl_table trans_net_table[] = { { NET_CORE, "core", trans_net_core_table }, /* NET_ETHER not used */ /* NET_802 not used */ @@ -716,7 +716,7 @@ static struct trans_ctl_table trans_net_table[] = { {} }; -static struct trans_ctl_table trans_fs_quota_table[] = { +static const struct trans_ctl_table trans_fs_quota_table[] = { { FS_DQ_LOOKUPS, "lookups" }, { FS_DQ_DROPS, "drops" }, { FS_DQ_READS, "reads" }, @@ -729,7 +729,7 @@ static struct trans_ctl_table trans_fs_quota_table[] = { {} }; -static struct trans_ctl_table trans_fs_xfs_table[] = { +static const struct trans_ctl_table trans_fs_xfs_table[] = { { XFS_RESTRICT_CHOWN, "restrict_chown" }, { XFS_SGID_INHERIT, "irix_sgid_inherit" }, { XFS_SYMLINK_MODE, "irix_symlink_mode" }, @@ -750,24 +750,24 @@ static struct trans_ctl_table trans_fs_xfs_table[] = { {} }; -static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { +static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { { 1, "hb_ctl_path" }, {} }; -static struct trans_ctl_table trans_fs_ocfs2_table[] = { +static const struct trans_ctl_table trans_fs_ocfs2_table[] = { { 1, "nm", trans_fs_ocfs2_nm_table }, {} }; -static struct trans_ctl_table trans_inotify_table[] = { +static const struct trans_ctl_table trans_inotify_table[] = { { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, {} }; -static struct trans_ctl_table trans_fs_table[] = { +static const struct trans_ctl_table trans_fs_table[] = { { FS_NRINODE, "inode-nr" }, { FS_STATINODE, "inode-state" }, /* FS_MAXINODE unused */ @@ -793,11 +793,11 @@ static struct trans_ctl_table trans_fs_table[] = { {} }; -static struct trans_ctl_table trans_debug_table[] = { +static const struct trans_ctl_table trans_debug_table[] = { {} }; -static struct trans_ctl_table trans_cdrom_table[] = { +static const struct trans_ctl_table trans_cdrom_table[] = { { DEV_CDROM_INFO, "info" }, { DEV_CDROM_AUTOCLOSE, "autoclose" }, { DEV_CDROM_AUTOEJECT, "autoeject" }, @@ -807,12 +807,12 @@ static struct trans_ctl_table trans_cdrom_table[] = { {} }; -static struct trans_ctl_table trans_ipmi_table[] = { +static const struct trans_ctl_table trans_ipmi_table[] = { { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, {} }; -static struct trans_ctl_table trans_mac_hid_files[] = { +static const struct trans_ctl_table trans_mac_hid_files[] = { /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, @@ -822,35 +822,35 @@ static struct trans_ctl_table trans_mac_hid_files[] = { {} }; -static struct trans_ctl_table trans_raid_table[] = { +static const struct trans_ctl_table trans_raid_table[] = { { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, {} }; -static struct trans_ctl_table trans_scsi_table[] = { +static const struct trans_ctl_table trans_scsi_table[] = { { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, {} }; -static struct trans_ctl_table trans_parport_default_table[] = { +static const struct trans_ctl_table trans_parport_default_table[] = { { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, {} }; -static struct trans_ctl_table trans_parport_device_table[] = { +static const struct trans_ctl_table trans_parport_device_table[] = { { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, {} }; -static struct trans_ctl_table trans_parport_devices_table[] = { +static const struct trans_ctl_table trans_parport_devices_table[] = { { DEV_PARPORT_DEVICES_ACTIVE, "active" }, { 0, NULL, trans_parport_device_table }, {} }; -static struct trans_ctl_table trans_parport_parport_table[] = { +static const struct trans_ctl_table trans_parport_parport_table[] = { { DEV_PARPORT_SPINTIME, "spintime" }, { DEV_PARPORT_BASE_ADDR, "base-addr" }, { DEV_PARPORT_IRQ, "irq" }, @@ -864,13 +864,13 @@ static struct trans_ctl_table trans_parport_parport_table[] = { { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, {} }; -static struct trans_ctl_table trans_parport_table[] = { +static const struct trans_ctl_table trans_parport_table[] = { { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, { 0, NULL, trans_parport_parport_table }, {} }; -static struct trans_ctl_table trans_dev_table[] = { +static const struct trans_ctl_table trans_dev_table[] = { { DEV_CDROM, "cdrom", trans_cdrom_table }, /* DEV_HWMON unused */ { DEV_PARPORT, "parport", trans_parport_table }, @@ -881,19 +881,19 @@ static struct trans_ctl_table trans_dev_table[] = { {} }; -static struct trans_ctl_table trans_bus_isa_table[] = { +static const struct trans_ctl_table trans_bus_isa_table[] = { { BUS_ISA_MEM_BASE, "membase" }, { BUS_ISA_PORT_BASE, "portbase" }, { BUS_ISA_PORT_SHIFT, "portshift" }, {} }; -static struct trans_ctl_table trans_bus_table[] = { +static const struct trans_ctl_table trans_bus_table[] = { { CTL_BUS_ISA, "isa", trans_bus_isa_table }, {} }; -static struct trans_ctl_table trans_arlan_conf_table0[] = { +static const struct trans_ctl_table trans_arlan_conf_table0[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -964,7 +964,7 @@ static struct trans_ctl_table trans_arlan_conf_table0[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table1[] = { +static const struct trans_ctl_table trans_arlan_conf_table1[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1035,7 +1035,7 @@ static struct trans_ctl_table trans_arlan_conf_table1[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table2[] = { +static const struct trans_ctl_table trans_arlan_conf_table2[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1106,7 +1106,7 @@ static struct trans_ctl_table trans_arlan_conf_table2[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table3[] = { +static const struct trans_ctl_table trans_arlan_conf_table3[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1177,7 +1177,7 @@ static struct trans_ctl_table trans_arlan_conf_table3[] = { {} }; -static struct trans_ctl_table trans_arlan_table[] = { +static const struct trans_ctl_table trans_arlan_table[] = { { 1, "arlan0", trans_arlan_conf_table0 }, { 2, "arlan1", trans_arlan_conf_table1 }, { 3, "arlan2", trans_arlan_conf_table2 }, @@ -1185,13 +1185,13 @@ static struct trans_ctl_table trans_arlan_table[] = { {} }; -static struct trans_ctl_table trans_s390dbf_table[] = { +static const struct trans_ctl_table trans_s390dbf_table[] = { { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, {} }; -static struct trans_ctl_table trans_sunrpc_table[] = { +static const struct trans_ctl_table trans_sunrpc_table[] = { { CTL_RPCDEBUG, "rpc_debug" }, { CTL_NFSDEBUG, "nfs_debug" }, { CTL_NFSDDEBUG, "nfsd_debug" }, @@ -1203,7 +1203,7 @@ static struct trans_ctl_table trans_sunrpc_table[] = { {} }; -static struct trans_ctl_table trans_pm_table[] = { +static const struct trans_ctl_table trans_pm_table[] = { { 1 /* CTL_PM_SUSPEND */, "suspend" }, { 2 /* CTL_PM_CMODE */, "cmode" }, { 3 /* CTL_PM_P0 */, "p0" }, @@ -1211,13 +1211,13 @@ static struct trans_ctl_table trans_pm_table[] = { {} }; -static struct trans_ctl_table trans_frv_table[] = { +static const struct trans_ctl_table trans_frv_table[] = { { 1, "cache-mode" }, { 2, "pin-cxnr" }, {} }; -static struct trans_ctl_table trans_root_table[] = { +static const struct trans_ctl_table trans_root_table[] = { { CTL_KERN, "kernel", trans_kern_table }, { CTL_VM, "vm", trans_vm_table }, { CTL_NET, "net", trans_net_table }, @@ -1261,15 +1261,14 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) return table; } -static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) +static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) { struct ctl_table *test; - struct trans_ctl_table *ref; - int depth, cur_depth; + const struct trans_ctl_table *ref; + int cur_depth; - depth = sysctl_depth(table); + cur_depth = sysctl_depth(table); - cur_depth = depth; ref = trans_root_table; repeat: test = sysctl_parent(table, cur_depth); @@ -1437,7 +1436,7 @@ static void sysctl_check_leaf(struct nsproxy *namespaces, static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) { - struct trans_ctl_table *ref; + const struct trans_ctl_table *ref; ref = sysctl_binary_lookup(table); if (table->ctl_name && !ref) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 88cdb10..06b6395 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -135,6 +135,12 @@ static int test_jprobe(void) #ifdef CONFIG_KRETPROBES static u32 krph_val; +static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + krph_val = (rand1 / div_factor); + return 0; +} + static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); @@ -144,13 +150,19 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) printk(KERN_ERR "Kprobe smoke test failed: " "incorrect value in kretprobe handler\n"); } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } - krph_val = (rand1 / div_factor); + krph_val = rand1; return 0; } static struct kretprobe rp = { .handler = return_handler, + .entry_handler = entry_handler, .kp.symbol_name = "kprobe_target" }; @@ -167,7 +179,7 @@ static int test_kretprobe(void) ret = kprobe_target(rand1); unregister_kretprobe(&rp); - if (krph_val == 0) { + if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " "kretprobe handler not called\n"); handler_errors++; diff --git a/kernel/time.c b/kernel/time.c index 4064c05..a5ec013b6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -39,6 +39,8 @@ #include <asm/uaccess.h> #include <asm/unistd.h> +#include "timeconst.h" + /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. @@ -93,7 +95,8 @@ asmlinkage long sys_stime(time_t __user *tptr) #endif /* __ARCH_WANT_SYS_TIME */ -asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) +asmlinkage long sys_gettimeofday(struct timeval __user *tv, + struct timezone __user *tz) { if (likely(tv != NULL)) { struct timeval ktv; @@ -118,7 +121,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * - * - TYT, 1992-01-01 + * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about @@ -240,7 +243,11 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else - return (j * MSEC_PER_SEC) / HZ; +# if BITS_PER_LONG == 32 + return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); @@ -252,7 +259,11 @@ unsigned int inline jiffies_to_usecs(const unsigned long j) #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else - return (j * USEC_PER_SEC) / HZ; +# if BITS_PER_LONG == 32 + return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); @@ -267,7 +278,7 @@ EXPORT_SYMBOL(jiffies_to_usecs); * * This function should be only used for timestamps returned by * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the later. + * it doesn't handle the better resolution of the latter. */ struct timespec timespec_trunc(struct timespec t, unsigned gran) { @@ -315,7 +326,7 @@ EXPORT_SYMBOL_GPL(getnstimeofday); * This algorithm was first published by Gauss (I think). * * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines were long is 32-bit! (However, as time_t is signed, we + * machines where long is 32-bit! (However, as time_t is signed, we * will already get problems at other places on 2038-01-19 03:14:08) */ unsigned long @@ -352,7 +363,7 @@ EXPORT_SYMBOL(mktime); * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC + * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) @@ -453,12 +464,13 @@ unsigned long msecs_to_jiffies(const unsigned int m) /* * Generic case - multiply, round and divide. But first * check that if we are doing a net multiplication, that - * we wouldnt overflow: + * we wouldn't overflow: */ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; - return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; + return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; #endif } EXPORT_SYMBOL(msecs_to_jiffies); @@ -472,7 +484,8 @@ unsigned long usecs_to_jiffies(const unsigned int u) #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) return u * (HZ / USEC_PER_SEC); #else - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; + return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; #endif } EXPORT_SYMBOL(usecs_to_jiffies); @@ -566,7 +579,11 @@ EXPORT_SYMBOL(jiffies_to_timeval); clock_t jiffies_to_clock_t(long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else return x / (HZ / USER_HZ); +# endif #else u64 tmp = (u64)x * TICK_NSEC; do_div(tmp, (NSEC_PER_SEC / USER_HZ)); @@ -599,7 +616,14 @@ EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x *= USER_HZ; + do_div(x, HZ); +# elif HZ > USER_HZ do_div(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif #else /* * There are better ways that don't overflow early, @@ -611,7 +635,6 @@ u64 jiffies_64_to_clock_t(u64 x) #endif return x; } - EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) @@ -646,7 +669,6 @@ u64 get_jiffies_64(void) } while (read_seqretry(&xtime_lock, seq)); return ret; } - EXPORT_SYMBOL(get_jiffies_64); #endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3e59fce..3d1e3e1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -133,7 +133,7 @@ static void clockevents_do_notify(unsigned long reason, void *dev) } /* - * Called after a notify add to make devices availble which were + * Called after a notify add to make devices available which were * released from the notifier call. */ static void clockevents_notify_released(void) @@ -218,6 +218,8 @@ void clockevents_exchange_device(struct clock_event_device *old, */ void clockevents_notify(unsigned long reason, void *arg) { + struct list_head *node, *tmp; + spin_lock(&clockevents_lock); clockevents_do_notify(reason, arg); @@ -227,13 +229,8 @@ void clockevents_notify(unsigned long reason, void *arg) * Unregister the clock event devices which were * released from the users in the notify chain. */ - while (!list_empty(&clockevents_released)) { - struct clock_event_device *dev; - - dev = list_entry(clockevents_released.next, - struct clock_event_device, list); - list_del(&dev->list); - } + list_for_each_safe(node, tmp, &clockevents_released) + list_del(node); break; default: break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6e9259a..548c436 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -91,7 +91,6 @@ static void clocksource_ratewd(struct clocksource *cs, int64_t delta) cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); clocksource_change_rating(cs, 0); - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; list_del(&cs->wd_list); } @@ -363,15 +362,13 @@ void clocksource_unregister(struct clocksource *cs) static ssize_t sysfs_show_current_clocksources(struct sys_device *dev, char *buf) { - char *curr = buf; + ssize_t count = 0; spin_lock_irq(&clocksource_lock); - curr += sprintf(curr, "%s ", curr_clocksource->name); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); spin_unlock_irq(&clocksource_lock); - curr += sprintf(curr, "\n"); - - return curr - buf; + return count; } /** @@ -439,17 +436,20 @@ static ssize_t sysfs_show_available_clocksources(struct sys_device *dev, char *buf) { struct clocksource *src; - char *curr = buf; + ssize_t count = 0; spin_lock_irq(&clocksource_lock); list_for_each_entry(src, &clocksource_list, list) { - curr += sprintf(curr, "%s ", src->name); + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); } spin_unlock_irq(&clocksource_lock); - curr += sprintf(curr, "\n"); + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); - return curr - buf; + return count; } /* diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e64efaf..c88b591 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,10 +43,6 @@ long time_freq; /* frequency offset (scaled ppm)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; -#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) -#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ - (s64)CLOCK_TICK_RATE) - static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 88267f0..fa9bb73 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -681,7 +681,7 @@ int tick_check_oneshot_change(int allow_nohz) if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; - if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd5dbc4..1af9fb0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -201,9 +201,9 @@ static inline s64 __get_nsec_offset(void) { return 0; } #endif /** - * timekeeping_is_continuous - check to see if timekeeping is free running + * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ -int timekeeping_is_continuous(void) +int timekeeping_valid_for_hres(void) { unsigned long seq; int ret; @@ -364,7 +364,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * with losing too many ticks, otherwise we would overadjust and * produce an even larger error. The smaller the adjustment the * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted + * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl new file mode 100644 index 0000000..4146803 --- /dev/null +++ b/kernel/timeconst.pl @@ -0,0 +1,402 @@ +#!/usr/bin/perl +# ----------------------------------------------------------------------- +# +# Copyright 2007 rPath, Inc. - All Rights Reserved +# +# This file is part of the Linux kernel, and is made available under +# the terms of the GNU General Public License version 2 or (at your +# option) any later version; incorporated herein by reference. +# +# ----------------------------------------------------------------------- +# + +# +# Usage: timeconst.pl HZ > timeconst.h +# + +# Precomputed values for systems without Math::BigInt +# Generated by: +# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 +%canned_values = ( + 24 => [ + '0xa6aaaaab','0x2aaaaaa',26, + '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58, + 125,3, + '0xc49ba5e4','0x1fbe76c8b4',37, + '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69, + 3,125, + '0xa2c2aaab','0xaaaa',16, + '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48, + 125000,3, + '0xc9539b89','0x7fffbce4217d',47, + '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79, + 3,125000, + ], 32 => [ + '0xfa000000','0x6000000',27, + '0xfa00000000000000','0x600000000000000',59, + 125,4, + '0x83126e98','0xfdf3b645a',36, + '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68, + 4,125, + '0xf4240000','0x0',17, + '0xf424000000000000','0x0',49, + 31250,1, + '0x8637bd06','0x3fff79c842fa',46, + '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78, + 1,31250, + ], 48 => [ + '0xa6aaaaab','0x6aaaaaa',27, + '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59, + 125,6, + '0xc49ba5e4','0xfdf3b645a',36, + '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68, + 6,125, + '0xa2c2aaab','0x15555',17, + '0xa2c2aaaaaaaaaaab','0x1555555555555',49, + 62500,3, + '0xc9539b89','0x3fffbce4217d',46, + '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78, + 3,62500, + ], 64 => [ + '0xfa000000','0xe000000',28, + '0xfa00000000000000','0xe00000000000000',60, + 125,8, + '0x83126e98','0x7ef9db22d',35, + '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67, + 8,125, + '0xf4240000','0x0',18, + '0xf424000000000000','0x0',50, + 15625,1, + '0x8637bd06','0x1fff79c842fa',45, + '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77, + 1,15625, + ], 100 => [ + '0xa0000000','0x0',28, + '0xa000000000000000','0x0',60, + 10,1, + '0xcccccccd','0x733333333',35, + '0xcccccccccccccccd','0x73333333333333333',67, + 1,10, + '0x9c400000','0x0',18, + '0x9c40000000000000','0x0',50, + 10000,1, + '0xd1b71759','0x1fff2e48e8a7',45, + '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77, + 1,10000, + ], 122 => [ + '0x8325c53f','0xfbcda3a',28, + '0x8325c53ef368eb05','0xfbcda3ac10c9714',60, + 500,61, + '0xf9db22d1','0x7fbe76c8b',35, + '0xf9db22d0e560418a','0x7fbe76c8b43958106',67, + 61,500, + '0x8012e2a0','0x3ef36',18, + '0x8012e29f79b47583','0x3ef368eb04325',50, + 500000,61, + '0xffda4053','0x1ffffbce4217',45, + '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77, + 61,500000, + ], 128 => [ + '0xfa000000','0x1e000000',29, + '0xfa00000000000000','0x1e00000000000000',61, + 125,16, + '0x83126e98','0x3f7ced916',34, + '0x83126e978d4fdf3c','0x3f7ced916872b020c',66, + 16,125, + '0xf4240000','0x40000',19, + '0xf424000000000000','0x4000000000000',51, + 15625,2, + '0x8637bd06','0xfffbce4217d',44, + '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76, + 2,15625, + ], 200 => [ + '0xa0000000','0x0',29, + '0xa000000000000000','0x0',61, + 5,1, + '0xcccccccd','0x333333333',34, + '0xcccccccccccccccd','0x33333333333333333',66, + 1,5, + '0x9c400000','0x0',19, + '0x9c40000000000000','0x0',51, + 5000,1, + '0xd1b71759','0xfff2e48e8a7',44, + '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76, + 1,5000, + ], 250 => [ + '0x80000000','0x0',29, + '0x8000000000000000','0x0',61, + 4,1, + '0x80000000','0x180000000',33, + '0x8000000000000000','0x18000000000000000',65, + 1,4, + '0xfa000000','0x0',20, + '0xfa00000000000000','0x0',52, + 4000,1, + '0x83126e98','0x7ff7ced9168',43, + '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75, + 1,4000, + ], 256 => [ + '0xfa000000','0x3e000000',30, + '0xfa00000000000000','0x3e00000000000000',62, + 125,32, + '0x83126e98','0x1fbe76c8b',33, + '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65, + 32,125, + '0xf4240000','0xc0000',20, + '0xf424000000000000','0xc000000000000',52, + 15625,4, + '0x8637bd06','0x7ffde7210be',43, + '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75, + 4,15625, + ], 300 => [ + '0xd5555556','0x2aaaaaaa',30, + '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62, + 10,3, + '0x9999999a','0x1cccccccc',33, + '0x999999999999999a','0x1cccccccccccccccc',65, + 3,10, + '0xd0555556','0xaaaaa',20, + '0xd055555555555556','0xaaaaaaaaaaaaa',52, + 10000,3, + '0x9d495183','0x7ffcb923a29',43, + '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75, + 3,10000, + ], 512 => [ + '0xfa000000','0x7e000000',31, + '0xfa00000000000000','0x7e00000000000000',63, + 125,64, + '0x83126e98','0xfdf3b645',32, + '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64, + 64,125, + '0xf4240000','0x1c0000',21, + '0xf424000000000000','0x1c000000000000',53, + 15625,8, + '0x8637bd06','0x3ffef39085f',42, + '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74, + 8,15625, + ], 1000 => [ + '0x80000000','0x0',31, + '0x8000000000000000','0x0',63, + 1,1, + '0x80000000','0x0',31, + '0x8000000000000000','0x0',63, + 1,1, + '0xfa000000','0x0',22, + '0xfa00000000000000','0x0',54, + 1000,1, + '0x83126e98','0x1ff7ced9168',41, + '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73, + 1,1000, + ], 1024 => [ + '0xfa000000','0xfe000000',32, + '0xfa00000000000000','0xfe00000000000000',64, + 125,128, + '0x83126e98','0x7ef9db22',31, + '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63, + 128,125, + '0xf4240000','0x3c0000',22, + '0xf424000000000000','0x3c000000000000',54, + 15625,16, + '0x8637bd06','0x1fff79c842f',41, + '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73, + 16,15625, + ], 1200 => [ + '0xd5555556','0xd5555555',32, + '0xd555555555555556','0xd555555555555555',64, + 5,6, + '0x9999999a','0x66666666',31, + '0x999999999999999a','0x6666666666666666',63, + 6,5, + '0xd0555556','0x2aaaaa',22, + '0xd055555555555556','0x2aaaaaaaaaaaaa',54, + 2500,3, + '0x9d495183','0x1ffcb923a29',41, + '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73, + 3,2500, + ] +); + +$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; + +sub bint($) +{ + my($x) = @_; + return Math::BigInt->new($x); +} + +# +# Constants for division by reciprocal multiplication. +# (bits, numerator, denominator) +# +sub fmul($$$) +{ + my ($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + return scalar (($n << $b)+$d-bint(1))/$d; +} + +sub fadj($$$) +{ + my($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + $d = $d/bgcd($n, $d); + return scalar (($d-bint(1)) << $b)/$d; +} + +sub fmuls($$$) { + my($b,$n,$d) = @_; + my($s,$m); + my($thres) = bint(1) << ($b-1); + + $n = bint($n); + $d = bint($d); + + for ($s = 0; 1; $s++) { + $m = fmul($s,$n,$d); + return $s if ($m >= $thres); + } + return 0; +} + +# Provides mul, adj, and shr factors for a specific +# (bit, time, hz) combination +sub muladj($$$) { + my($b, $t, $hz) = @_; + my $s = fmuls($b, $t, $hz); + my $m = fmul($s, $t, $hz); + my $a = fadj($s, $t, $hz); + return ($m->as_hex(), $a->as_hex(), $s); +} + +# Provides numerator, denominator values +sub numden($$) { + my($n, $d) = @_; + my $g = bgcd($n, $d); + return ($n/$g, $d/$g); +} + +# All values for a specific (time, hz) combo +sub conversions($$) { + my ($t, $hz) = @_; + my @val = (); + + # HZ_TO_xx + push(@val, muladj(32, $t, $hz)); + push(@val, muladj(64, $t, $hz)); + push(@val, numden($t, $hz)); + + # xx_TO_HZ + push(@val, muladj(32, $hz, $t)); + push(@val, muladj(64, $hz, $t)); + push(@val, numden($hz, $t)); + + return @val; +} + +sub compute_values($) { + my($hz) = @_; + my @val = (); + my $s, $m, $a, $g; + + if (!$has_bigint) { + die "$0: HZ == $hz not canned and ". + "Math::BigInt not available\n"; + } + + # MSEC conversions + push(@val, conversions(1000, $hz)); + + # USEC conversions + push(@val, conversions(1000000, $hz)); + + return @val; +} + +sub output($@) +{ + my($hz, @val) = @_; + my $pfx, $bit, $suf, $s, $m, $a; + + print "/* Automatically generated by kernel/timeconst.pl */\n"; + print "/* Conversion constants for HZ == $hz */\n"; + print "\n"; + print "#ifndef KERNEL_TIMECONST_H\n"; + print "#define KERNEL_TIMECONST_H\n"; + print "\n"; + + print "#include <linux/param.h>\n"; + + print "\n"; + print "#if HZ != $hz\n"; + print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; + print "#endif\n"; + print "\n"; + + foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', + 'HZ_TO_USEC','USEC_TO_HZ') { + foreach $bit (32, 64) { + foreach $suf ('MUL', 'ADJ', 'SHR') { + printf "#define %-23s %s\n", + "${pfx}_$suf$bit", shift(@val); + } + } + foreach $suf ('NUM', 'DEN') { + printf "#define %-23s %s\n", + "${pfx}_$suf", shift(@val); + } + } + + print "\n"; + print "#endif /* KERNEL_TIMECONST_H */\n"; +} + +($hz) = @ARGV; + +# Use this to generate the %canned_values structure +if ($hz eq '--can') { + shift(@ARGV); + @hzlist = sort {$a <=> $b} (@ARGV); + + print "# Precomputed values for systems without Math::BigInt\n"; + print "# Generated by:\n"; + print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; + print "\%canned_values = (\n"; + my $pf = "\t"; + foreach $hz (@hzlist) { + my @values = compute_values($hz); + print "$pf$hz => [\n"; + while (scalar(@values)) { + my $bit; + foreach $bit (32, 64) { + my $m = shift(@values); + my $a = shift(@values); + my $s = shift(@values); + print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; + } + my $n = shift(@values); + my $d = shift(@values); + print "\t\t",$n,',',$d,",\n"; + } + print "\t]"; + $pf = ', '; + } + print "\n);\n"; +} else { + $hz += 0; # Force to number + if ($hz < 1) { + die "Usage: $0 HZ\n"; + } + + @val = @{$canned_values{$hz}}; + if (!defined(@val)) { + @val = compute_values($hz); + } + output($hz, @val); +} +exit 0; diff --git a/kernel/timer.c b/kernel/timer.c index 9fbb472..99b00a2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -327,7 +327,7 @@ static void timer_stats_account_timer(struct timer_list *timer) {} * init_timer() must be done to a timer prior calling *any* of the * other timer functions. */ -void fastcall init_timer(struct timer_list *timer) +void init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); @@ -339,7 +339,7 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); -void fastcall init_timer_deferrable(struct timer_list *timer) +void init_timer_deferrable(struct timer_list *timer) { init_timer(timer); timer_set_deferrable(timer); @@ -818,12 +818,14 @@ unsigned long next_timer_interrupt(void) #ifndef CONFIG_VIRT_CPU_ACCOUNTING void account_process_tick(struct task_struct *p, int user_tick) { + cputime_t one_jiffy = jiffies_to_cputime(1); + if (user_tick) { - account_user_time(p, jiffies_to_cputime(1)); - account_user_time_scaled(p, jiffies_to_cputime(1)); + account_user_time(p, one_jiffy); + account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); } else { - account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); - account_system_time_scaled(p, jiffies_to_cputime(1)); + account_system_time(p, HARDIRQ_OFFSET, one_jiffy); + account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); } } #endif @@ -977,7 +979,7 @@ asmlinkage long sys_getppid(void) int pid; rcu_read_lock(); - pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); + pid = task_tgid_vnr(current->real_parent); rcu_read_unlock(); return pid; @@ -1040,7 +1042,7 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -fastcall signed long __sched schedule_timeout(signed long timeout) +signed long __sched schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; diff --git a/kernel/user.c b/kernel/user.c index bc1c48d..7132022 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,6 +17,14 @@ #include <linux/module.h> #include <linux/user_namespace.h> +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .root_user = &root_user, +}; +EXPORT_SYMBOL_GPL(init_user_ns); + /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). @@ -49,7 +57,7 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif }; @@ -82,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED static void sched_destroy_user(struct user_struct *up) { @@ -105,15 +113,15 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +#else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } static void sched_switch_user(struct task_struct *p) { } -#endif /* CONFIG_FAIR_USER_SCHED */ +#endif /* CONFIG_USER_SCHED */ -#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) +#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex); @@ -129,6 +137,7 @@ static inline void uids_mutex_unlock(void) } /* uid directory attributes */ +#ifdef CONFIG_FAIR_GROUP_SCHED static ssize_t cpu_shares_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -155,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj, static struct kobj_attribute cpu_share_attr = __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static ssize_t cpu_rt_runtime_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); +} + +static ssize_t cpu_rt_runtime_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_runtime; + int rc; + + sscanf(buf, "%lu", &rt_runtime); + + rc = sched_group_set_rt_runtime(up->tg, rt_runtime); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_runtime_attr = + __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); +#endif /* default attributes per uid directory */ static struct attribute *uids_attributes[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED &cpu_share_attr.attr, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + &cpu_rt_runtime_attr.attr, +#endif NULL }; @@ -261,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags) schedule_work(&up->work); } -#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ +#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ int uids_sysfs_init(void) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; } @@ -365,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_FAIR_USER_SCHED + /* This case is not possible when CONFIG_USER_SCHED * is defined, since we serialize alloc_uid() using * uids_mutex. Hence no need to call * sched_destroy_user() or remove_user_sysfs_dir(). @@ -427,6 +471,7 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +#ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) { int i; @@ -451,6 +496,7 @@ void release_uids(struct user_namespace *ns) free_uid(ns->root_user); } +#endif static int __init uid_cache_init(void) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7af90fc..4c90062 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -10,17 +10,6 @@ #include <linux/nsproxy.h> #include <linux/user_namespace.h> -struct user_namespace init_user_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .root_user = &root_user, -}; - -EXPORT_SYMBOL_GPL(init_user_ns); - -#ifdef CONFIG_USER_NS - /* * Clone a new ns copying an original user ns, setting refcount to 1 * @old_ns: namespace to clone @@ -84,5 +73,3 @@ void free_user_ns(struct kref *kref) release_uids(ns); kfree(ns); } - -#endif /* CONFIG_USER_NS */ diff --git a/kernel/wait.c b/kernel/wait.c index f987688..c275c56 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -18,7 +18,7 @@ void init_waitqueue_head(wait_queue_head_t *q) EXPORT_SYMBOL(init_waitqueue_head); -void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -29,7 +29,7 @@ void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue); -void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -40,7 +40,7 @@ void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue_exclusive); -void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -63,7 +63,7 @@ EXPORT_SYMBOL(remove_wait_queue); * stops them from bleeding out - it would still allow subsequent * loads to move into the critical region). */ -void fastcall +void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; @@ -82,7 +82,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait); -void fastcall +void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; @@ -101,7 +101,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -157,7 +157,7 @@ EXPORT_SYMBOL(wake_bit_function); * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are * permitted return codes. Nonzero return codes halt waiting and return. */ -int __sched fastcall +int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { @@ -173,7 +173,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, } EXPORT_SYMBOL(__wait_on_bit); -int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); @@ -183,7 +183,7 @@ int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); -int __sched fastcall +int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { @@ -201,7 +201,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); @@ -211,7 +211,7 @@ int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); if (waitqueue_active(wq)) @@ -236,13 +236,13 @@ EXPORT_SYMBOL(__wake_up_bit); * may need to use a less regular barrier, such fs/inode.c's smp_mb(), * because spin_unlock() does not guarantee a memory barrier. */ -void fastcall wake_up_bit(void *word, int bit) +void wake_up_bit(void *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } EXPORT_SYMBOL(wake_up_bit); -fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(void *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; const struct zone *zone = page_zone(virt_to_page(word)); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 52db48e..ff06611 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,7 +161,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. */ -int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) +int queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; @@ -175,7 +175,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -void delayed_work_timer_fn(unsigned long __data) +static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); @@ -192,7 +192,7 @@ void delayed_work_timer_fn(unsigned long __data) * * Returns 0 if @work was already on a queue, non-zero otherwise. */ -int fastcall queue_delayed_work(struct workqueue_struct *wq, +int queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { timer_stats_timer_set_start_info(&dwork->timer); @@ -388,7 +388,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. */ -void fastcall flush_workqueue(struct workqueue_struct *wq) +void flush_workqueue(struct workqueue_struct *wq) { const cpumask_t *cpu_map = wq_cpu_map(wq); int cpu; @@ -546,7 +546,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; * * This puts a job in the kernel-global workqueue. */ -int fastcall schedule_work(struct work_struct *work) +int schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } @@ -560,7 +560,7 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, +int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { timer_stats_timer_set_start_info(&dwork->timer); |