Diffstat (limited to 'kernel')
42 files changed, 1568 insertions, 2147 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 5e3f3b7..14f4d45 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o -obj-$(CONFIG_STACK_UNWIND) += unwind.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o diff --git a/kernel/acct.c b/kernel/acct.c index dc12db8..70d0d88 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { - mnt_unpin(old_acct->f_vfsmnt); + mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); do_acct_process(old_acct); filp_close(old_acct, NULL); @@ -212,7 +212,7 @@ static int acct_on(char *name) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -229,11 +229,11 @@ static int acct_on(char *name) } spin_lock(&acct_globals.lock); - mnt_pin(file->f_vfsmnt); + mnt_pin(file->f_path.mnt); acct_file_reopen(file); spin_unlock(&acct_globals.lock); - mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ return 0; } @@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); } @@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); @@ -428,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -484,16 +485,9 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - mutex_lock(&tty_mutex); - /* FIXME: Whoever is responsible for current->signal locking needs - to use the same locking all over the kernel and document it */ - read_lock(&tasklist_lock); - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); - mutex_unlock(&tty_mutex); - spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; + ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2e896f8..9c8c232 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -800,8 +800,8 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, /* our own copy of se_str */ se_str = kstrdup(sf->se_str, GFP_KERNEL); - if (unlikely(IS_ERR(se_str))) - return -ENOMEM; + if (unlikely(!se_str)) + return -ENOMEM; df->se_str = se_str; /* our own (refreshed) copy of se_rule */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40722e2..2988975 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); + vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); break; } vma = vma->vm_next; @@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->return_code); mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" diff --git a/kernel/cpu.c b/kernel/cpu.c index 9124669..7406fe6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -204,7 +204,7 @@ int cpu_down(unsigned int cpu) #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __devinit _cpu_up(unsigned int cpu) +static int __cpuinit _cpu_up(unsigned int cpu) { int ret; void *hcpu = (void *)(long)cpu; @@ -239,7 +239,7 @@ out_notify: return ret; } -int __devinit cpu_up(unsigned int cpu) +int __cpuinit cpu_up(unsigned int cpu) { int err = 0; @@ -258,7 +258,7 @@ static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error = 0; mutex_lock(&cpu_add_remove_lock); first_cpu = first_cpu(cpu_present_map); @@ -294,7 +294,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled"); + printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } out: mutex_unlock(&cpu_add_remove_lock); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0a6b4d8..6b05dc6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { * * * When reading/writing to a file: - * - the cpuset to use in file->f_dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_dentry->d_fsdata + * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { @@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); - struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); cpuset_filetype_t type = cft->private; char *buffer; char *pathbuf = NULL; @@ -1367,7 +1367,7 @@ 
static ssize_t cpuset_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_dentry); - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) if (err) return err; - cft = __d_cft(file->f_dentry); + cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; if (cft->open) @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) static int cpuset_file_release(struct inode *inode, struct file *file) { - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (cft->release) return cft->release(inode, file); return 0; @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); struct ctr_struct *ctr; pid_t *pidarray; int npids; @@ -2342,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? * @z: is this zone on an allowed node? - * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) + * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If zone + * If we're in interrupt, yes, we can always allocate. If + * __GFP_THISNODE is set, yes, we can always allocate. If zone * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. * Otherwise, no. * + * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() + * reduces to cpuset_zone_allowed_hardwall(). Otherwise, + * cpuset_zone_allowed_softwall() might sleep, and might allow a zone + * from an enclosing cpuset. + * + * cpuset_zone_allowed_hardwall() only handles the simpler case of + * hardwall cpusets, and never sleeps. + * + * The __GFP_THISNODE placement logic is really handled elsewhere, + * by forcibly using a zonelist starting at a specified node, and by + * (in get_page_from_freelist()) refusing to consider the zones for + * any node on the zonelist except the first. By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset. 
* GFP_KERNEL allocations are not so marked, so can escape to the - * nearest mem_exclusive ancestor cpuset. + * nearest enclosing mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() - * routine only calls here with __GFP_HARDWALL bit _not_ set if - * it's a GFP_KERNEL allocation, and all nodes in the current tasks - * mems_allowed came up empty on the first pass over the zonelist. - * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the callback_mutex mutex. + * Scanning up parent cpusets requires callback_mutex. The + * __alloc_pages() routine only calls here with __GFP_HARDWALL bit + * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the + * current tasks mems_allowed came up empty on the first pass over + * the zonelist. So only GFP_KERNEL allocations, if all nodes in the + * cpuset are short of memory, might require taking the callback_mutex + * mutex. * * The first call here from mm/page_alloc:get_page_from_freelist() - * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so - * no allocation on a node outside the cpuset is allowed (unless in - * interrupt, of course). + * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, + * so no allocation on a node outside the cpuset is allowed (unless + * in interrupt, of course). * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -2380,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - * Don't call cpuset_zone_allowed() if you can't sleep, unless you + * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables * the code that might scan up ancestor cpusets and sleep. - **/ + */ -int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ @@ -2415,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return allowed; } +/* + * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags + * + * If we're in interrupt, yes, we can always allocate. + * If __GFP_THISNODE is set, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. Otherwise, no. + * + * The __GFP_THISNODE placement logic is really handled elsewhere, + * by forcibly using a zonelist starting at a specified node, and by + * (in get_page_from_freelist()) refusing to consider the zones for + * any node on the zonelist except the first. By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * + * Unlike the cpuset_zone_allowed_softwall() variant, above, + * this variant requires that the zone be in the current tasks + * mems_allowed or that we're in interrupt. It does not scan up the + * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. + * It never sleeps. 
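To make the softwall/hardwall split described above concrete, here is a rough caller-side sketch. It uses the wrapper names the comments refer to (cpuset_zone_allowed_softwall()/cpuset_zone_allowed_hardwall()); the surrounding function and its can_sleep flag are invented for illustration and are not the real mm/page_alloc.c logic.

/* Illustrative only: the intended division of labour between the two
 * variants described above, not actual allocator code. */
static int zone_ok_for_alloc(struct zone *z, gfp_t gfp_mask, int can_sleep)
{
        if (!can_sleep || (gfp_mask & __GFP_HARDWALL))
                /* atomic path: never sleeps, never escapes the task's cpuset */
                return cpuset_zone_allowed_hardwall(z, gfp_mask);

        /* may sleep and may fall back to the nearest mem_exclusive
         * ancestor cpuset */
        return cpuset_zone_allowed_softwall(z, gfp_mask);
}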
+ */ + +int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + int node; /* node that zone z is on */ + + if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + return 1; + node = zone_to_nid(z); + if (node_isset(node, current->mems_allowed)) + return 1; + return 0; +} + /** * cpuset_lock - lock out any changes to cpuset structures * @@ -2606,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file) return single_open(file, proc_cpuset_show, pid); } -const struct file_operations proc_cpuset_operations = { +struct file_operations proc_cpuset_operations = { .open = cpuset_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/exit.c b/kernel/exit.c index 4e3f919..fec12eb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,7 +13,7 @@ #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/key.h> #include <linux/security.h> #include <linux/cpu.h> @@ -22,6 +22,7 @@ #include <linux/file.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> +#include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -48,7 +49,6 @@ #include <asm/mmu_context.h> extern void sem_exit (void); -extern struct task_struct *child_reaper; static void exit_mm(struct task_struct * tsk); @@ -189,21 +189,18 @@ repeat: int session_of_pgrp(int pgrp) { struct task_struct *p; - int sid = -1; + int sid = 0; read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; - goto out; - } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); - if (p) - sid = p->signal->session; -out: + + p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + if (p == NULL) + p = find_task_by_pid(pgrp); + if (p != NULL) + sid = process_session(p); + read_unlock(&tasklist_lock); - + return sid; } @@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { + if (process_group(p->real_parent) != pgrp && + process_session(p->real_parent) == process_session(p)) { ret = 0; break; } @@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task. + * reparent_to_init - Reparent the calling kernel thread to the init task + * of the pid space that the thread belongs to. 
* * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -278,8 +276,8 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper; - current->real_parent = child_reaper; + current->parent = child_reaper(current); + current->real_parent = child_reaper(current); add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (curr->signal->session != session) { + if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; + set_signal_session(curr->signal, session); attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { @@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) } } -void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(pid_t session, pid_t pgrp) { write_lock_irq(&tasklist_lock); __set_special_pids(session, pgrp); @@ -384,9 +382,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); /* Block and flush all signals */ sigfillset(&blocked); @@ -429,7 +425,7 @@ static void close_files(struct files_struct * files) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { @@ -470,9 +466,7 @@ void fastcall put_files_struct(struct files_struct *files) * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt == &files->fdtab) - fdt->free_files = files; - else + if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); free_fdtable(fdt); } @@ -603,10 +597,6 @@ choose_new_parent(struct task_struct *p, struct task_struct *reaper) static void reparent_thread(struct task_struct *p, struct task_struct *father, int traced) { - /* We don't want people slaying init. */ - if (p->exit_signal != -1) - p->exit_signal = SIGCHLD; - if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); @@ -626,13 +616,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->parent = p->real_parent; add_parent(p); - /* If we'd notified the old parent about this child's death, - * also notify the new parent. - */ - if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - else if (p->state == TASK_TRACED) { + if (p->state == TASK_TRACED) { /* * If it was at a trace stop, turn it into * a normal stop since it's no longer being @@ -642,6 +626,23 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) } } + /* If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (p->real_parent->group_leader == father->group_leader) + return; + + /* We don't want people slaying init. */ + if (p->exit_signal != -1) + p->exit_signal = SIGCHLD; + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. 
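The kernel/exit.c changes above (and the kernel/fork.c ones later in this diff) replace direct signal->session accesses with process_session() and set_signal_session(). Those helpers are defined in the headers rather than in this hunk; roughly, they are thin accessors along the lines below (a sketch only -- the field name used here is illustrative).

/* Sketch of the accessors used by the converted call sites above;
 * not copied from <linux/sched.h>. */
static inline pid_t signal_session(struct signal_struct *sig)
{
        return sig->__session;
}

static inline void set_signal_session(struct signal_struct *sig, pid_t session)
{
        sig->__session = session;
}

static inline pid_t process_session(struct task_struct *tsk)
{
        return signal_session(tsk->signal);
}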
+ */ + if (!traced && p->exit_state == EXIT_ZOMBIE && + p->exit_signal != -1 && thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + /* * process group orphan check * Case ii: Our child is in a different pgrp @@ -649,10 +650,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * outside, so the child pgrp is now orphaned. */ if ((process_group(p) != process_group(father)) && - (p->signal->session == father->signal->session)) { + (process_session(p) == process_session(father))) { int pgrp = process_group(p); - if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + if (will_become_orphaned_pgrp(pgrp, NULL) && + has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } @@ -663,7 +665,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * When we die, we re-parent all our children. * Try to give them to another thread in our thread * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * the child reaper process (ie "init") in our pid + * space. */ static void forget_original_parent(struct task_struct *father, struct list_head *to_release) @@ -674,7 +677,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; + reaper = child_reaper(father); break; } } while (reaper->exit_state); @@ -786,7 +789,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->signal->session == tsk->signal->session) && + (process_session(t) == process_session(tsk)) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); @@ -860,8 +863,13 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk == child_reaper)) - panic("Attempted to kill init!"); + if (unlikely(tsk == child_reaper(tsk))) { + if (tsk->nsproxy->pid_ns != &init_pid_ns) + tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; + else + panic("Attempted to kill init!"); + } + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; @@ -930,8 +938,8 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); - exit_notify(tsk); exit_task_namespaces(tsk); + exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 7f2e31b..d57118d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,7 +18,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> @@ -36,6 +36,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> @@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(oldmm); + flush_cache_dup_mm(oldmm); /* * Not linked in yet - no deadlock potential: */ @@ -252,7 +253,7 @@ static inline int 
dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) anon_vma_link(tmp); file = tmp->vm_file; if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); @@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void) newf->next_fd = 0; fdt = &newf->fdtab; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = EMBEDDED_FD_SET_SIZE; fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: @@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. 
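A note on the f_dentry/f_vfsmnt to f_path.dentry/f_path.mnt conversions running through acct.c, auditsc.c, cpuset.c and fork.c above (and futex.c below): struct file now keeps its mount and dentry together in an embedded struct path. The sketch below shows the shape of that pairing for orientation only; it is not copied from the VFS headers, and the struct file fragment is abbreviated.

struct vfsmount;
struct dentry;

/* Illustrative pairing behind the f_path conversions in this diff. */
struct path {
        struct vfsmount *mnt;           /* was file->f_vfsmnt */
        struct dentry   *dentry;        /* was file->f_dentry */
};

struct file_sketch {                    /* abbreviated stand-in for struct file */
        struct path     f_path;
        /* ... remaining struct file members ... */
};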
*/ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } -out: return newf; out_release: - free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); +out: return NULL; } @@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; + set_signal_session(p->signal, process_session(current)); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); + attach_pid(p, PIDTYPE_SID, process_session(p)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; @@ -1325,7 +1313,7 @@ noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_re return regs; } -struct task_struct * __devinit fork_idle(int cpu) +struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; struct pt_regs regs; @@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->nsproxy->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? 
new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new } /* - * Unsharing of sighand for tasks created with CLONE_SIGHAND is not - * supported yet + * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) { struct sighand_struct *sigh = current->sighand; - if ((unshare_flags & CLONE_SIGHAND) && - (sigh && atomic_read(&sigh->count) > 1)) + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) return -EINVAL; else return 0; @@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; - struct sighand_struct *sigh, *new_sigh = NULL; + struct mnt_namespace *ns, *new_ns = NULL; + struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; @@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + if (new_fs || new_ns || new_mm || new_fd || new_ulist || new_uts || new_ipc) { task_lock(current); @@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->nsproxy->namespace; - current->nsproxy->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } - if (new_sigh) { - sigh = current->sighand; - rcu_assign_pointer(current->sighand, new_sigh); - new_sigh = sigh; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs) diff --git a/kernel/futex.c b/kernel/futex.c index 95989a3..5a737de 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) /* * Get parameters which are the keys for a futex. * - * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) /* * Linear file mappings are also simple. */ - key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->shared.inode = vma->vm_file->f_path.dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. 
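For orientation while reading get_futex_key() above: the union it fills in distinguishes shared and private mappings roughly as sketched below. The layout is inferred from the accesses visible in this diff (key->shared.inode, key->shared.pgoff, key->both.offset) rather than copied verbatim from linux/futex.h.

/* Sketch of the futex key layout implied by the code above. */
union futex_key_sketch {
        struct {
                unsigned long pgoff;    /* offset within the mapping */
                struct inode *inode;    /* vma->vm_file->f_path.dentry->d_inode */
                int offset;             /* bit 0 set => inode-based key */
        } shared;
        struct {
                unsigned long address;  /* uaddr */
                struct mm_struct *mm;   /* current->mm */
                int offset;
        } private;
        struct {
                unsigned long word;
                void *ptr;
                int offset;
        } both;
};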
*/ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) @@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) goto out; } filp->f_op = &futex_fops; - filp->f_vfsmnt = mntget(futex_mnt); - filp->f_dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_path.mnt = mntget(futex_mnt); + filp->f_path.dentry = dget(futex_mnt->mnt_root); + filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; if (signal) { err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ebfd24a..475e8a7 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -39,6 +39,7 @@ void dynamic_irq_init(unsigned int irq) desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; desc->depth = 1; + desc->msi_desc = NULL; desc->handler_data = NULL; desc->chip_data = NULL; desc->action = NULL; @@ -74,6 +75,9 @@ void dynamic_irq_cleanup(unsigned int irq) WARN_ON(1); return; } + desc->msi_desc = NULL; + desc->handler_data = NULL; + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; spin_unlock_irqrestore(&desc->lock, flags); @@ -162,6 +166,30 @@ int set_irq_data(unsigned int irq, void *data) EXPORT_SYMBOL(set_irq_data); /** + * set_irq_data - set irq type data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data + * + * Set the hardware irq controller data for an irq + */ +int set_irq_msi(unsigned int irq, struct msi_desc *entry) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install msi data for IRQ%d\n", irq); + return -EINVAL; + } + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->msi_desc = entry; + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/** * set_irq_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data @@ -517,10 +545,9 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) handle = handle_bad_irq; - - if (desc->chip == &no_irq_chip) { + else if (desc->chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : " ", irq); + "for IRQ%d\n", is_chained ? "chained " : "", irq); /* * Some ARM implementations install a handler for really dumb * interrupt hardware without setting an irq_chip. This worked diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b385878..c4b7ed1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -315,6 +315,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Undo nested disables: */ desc->depth = 1; } + /* Reset broken irq detection when installing new handler */ + desc->irq_count = 0; + desc->irqs_unhandled = 0; spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; @@ -479,3 +482,89 @@ int request_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_irq); + +/* + * Device resource management aware IRQ request/free implementation. 
+ */ +struct irq_devres { + unsigned int irq; + void *dev_id; +}; + +static void devm_irq_release(struct device *dev, void *res) +{ + struct irq_devres *this = res; + + free_irq(this->irq, this->dev_id); +} + +static int devm_irq_match(struct device *dev, void *res, void *data) +{ + struct irq_devres *this = res, *match = data; + + return this->irq == match->irq && this->dev_id == match->dev_id; +} + +/** + * devm_request_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, dev_free_irq() must be used. + */ +int devm_request_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + kfree(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_irq); + +/** + * devm_free_irq - free an interrupt + * @dev: device to free interrupt for + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as free_irq(). + * This function instead of free_irq() should be used to manually + * free IRQs allocated with dev_request_irq(). 
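The managed request/free pair added in this hunk (devm_request_irq()/devm_free_irq()) is intended to be called from a driver's probe path, with the matching free happening automatically on detach. A minimal hypothetical example follows; foo_device, foo_interrupt and foo_probe are invented names, not part of this patch.

/* Hypothetical driver fragment showing the intended use of
 * devm_request_irq(); all foo_* names are made up for the example. */
struct foo_device {
        int irq;
};

static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
        /* dev_id is the foo_device pointer passed to devm_request_irq() */
        return IRQ_HANDLED;
}

static int foo_probe(struct device *dev, struct foo_device *foo)
{
        /* Released automatically on driver detach; no explicit
         * free_irq()/devm_free_irq() is needed in the error paths. */
        return devm_request_irq(dev, foo->irq, foo_interrupt, IRQF_SHARED,
                                "foo", foo);
}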
+ */ +void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) +{ + struct irq_devres match_data = { irq, dev_id }; + + free_irq(irq, dev_id); + WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + &match_data)); +} +EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a35266..61f5c71 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) + if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + CHECK_IRQ_PER_CPU(irq_desc[irq].status)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 543ea2e..9d8c79b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -176,7 +176,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, int noirqdebug __read_mostly; -int __init noirqdebug_setup(char *str) +int noirqdebug_setup(char *str) { noirqdebug = 1; printk(KERN_INFO "IRQ lockup detection disabled\n"); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab63cfc..6f294ff 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,14 +31,14 @@ #endif /* These will be re-linked against their real values during the second link stage */ -extern unsigned long kallsyms_addresses[] __attribute__((weak)); -extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); -extern u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const unsigned long kallsyms_num_syms __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); -extern u8 kallsyms_token_table[] __attribute__((weak)); -extern u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); -extern unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __attribute__((weak)); static inline int is_kernel_inittext(unsigned long addr) { @@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; - u8 *tptr, *data; + const u8 *tptr, *data; /* get the compressed symbol length from the first symbol byte */ data = &kallsyms_names[off]; @@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) * kallsyms array */ static unsigned int get_symbol_offset(unsigned long pos) { - u8 *name; + const u8 *name; int i; /* use the closest marker we have. 
We have markers every 256 positions, diff --git a/kernel/kmod.c b/kernel/kmod.c index 8d2bea0..3a7379a 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,7 +25,7 @@ #include <linux/kmod.h> #include <linux/smp_lock.h> #include <linux/slab.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/completion.h> #include <linux/file.h> #include <linux/workqueue.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 17ec4af..6fcf8dd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -87,6 +87,12 @@ struct kprobe_insn_page { int ngarbage; }; +enum kprobe_slot_state { + SLOT_CLEAN = 0, + SLOT_DIRTY = 1, + SLOT_USED = 2, +}; + static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -130,8 +136,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { - if (!kip->slot_used[i]) { - kip->slot_used[i] = 1; + if (kip->slot_used[i] == SLOT_CLEAN) { + kip->slot_used[i] = SLOT_USED; kip->nused++; return kip->insns + (i * MAX_INSN_SIZE); } @@ -163,8 +169,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } INIT_HLIST_NODE(&kip->hlist); hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, 0, INSNS_PER_PAGE); - kip->slot_used[0] = 1; + memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); + kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; return kip->insns; @@ -173,7 +179,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) /* Return 1 if all garbages are collected, otherwise 0. */ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { - kip->slot_used[idx] = 0; + kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; if (kip->nused == 0) { /* @@ -212,7 +218,7 @@ static int __kprobes collect_garbage_slots(void) continue; kip->ngarbage = 0; /* we will collect all garbages */ for (i = 0; i < INSNS_PER_PAGE; i++) { - if (kip->slot_used[i] == -1 && + if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } @@ -232,7 +238,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; if (dirty) { - kip->slot_used[i] = -1; + kip->slot_used[i] = SLOT_DIRTY; kip->ngarbage++; } else { collect_one_slot(kip, i); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b020324..509efd4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,13 +43,49 @@ #include "lockdep_internals.h" /* - * hash_lock: protects the lockdep hashes and class/list/hash allocators. + * lockdep_lock: protects the lockdep graph, the hashes and the + * class/list/hash allocators. * * This is one of the rare exceptions where it's justified * to use a raw spinlock - we really dont want the spinlock - * code to recurse back into the lockdep code. + * code to recurse back into the lockdep code... 
*/ -static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int graph_lock(void) +{ + __raw_spin_lock(&lockdep_lock); + /* + * Make sure that if another CPU detected a bug while + * walking the graph we dont change it (while the other + * CPU is busy printing out stuff with the graph lock + * dropped already) + */ + if (!debug_locks) { + __raw_spin_unlock(&lockdep_lock); + return 0; + } + return 1; +} + +static inline int graph_unlock(void) +{ + __raw_spin_unlock(&lockdep_lock); + return 0; +} + +/* + * Turn lock debugging off and return with 0 if it was off already, + * and also release the graph lock: + */ +static inline int debug_locks_off_graph_unlock(void) +{ + int ret = debug_locks_off(); + + __raw_spin_unlock(&lockdep_lock); + + return ret; +} static int lockdep_initialized; @@ -57,14 +93,15 @@ unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* - * Allocate a lockdep entry. (assumes hash_lock held, returns + * Allocate a lockdep entry. (assumes the graph_lock held, returns * with NULL on failure) */ static struct lock_list *alloc_list_entry(void) { if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - __raw_spin_unlock(&hash_lock); - debug_locks_off(); + if (!debug_locks_off_graph_unlock()) + return NULL; + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); return NULL; @@ -145,9 +182,7 @@ EXPORT_SYMBOL(lockdep_on); */ #define VERBOSE 0 -#ifdef VERBOSE -# define VERY_VERBOSE 0 -#endif +#define VERY_VERBOSE 0 #if VERBOSE # define HARDIRQ_VERBOSE 1 @@ -172,8 +207,8 @@ static int class_filter(struct lock_class *class) !strcmp(class->name, "&struct->lockfield")) return 1; #endif - /* Allow everything else. 0 would be filter everything else */ - return 1; + /* Filter everything else. 1 would be to allow everything else */ + return 0; } #endif @@ -207,7 +242,7 @@ static int softirq_verbose(struct lock_class *class) /* * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the hash_lock. + * addresses. Protected by the graph_lock. 
*/ unsigned long nr_stack_trace_entries; static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; @@ -226,18 +261,15 @@ static int save_trace(struct stack_trace *trace) trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) { - __raw_spin_unlock(&hash_lock); - return 0; - } if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { - __raw_spin_unlock(&hash_lock); - if (debug_locks_off()) { - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - } + if (!debug_locks_off_graph_unlock()) + return 0; + + printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + return 0; } @@ -526,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) { struct task_struct *curr = current; - __raw_spin_unlock(&hash_lock); - debug_locks_off(); - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=======================================================\n"); @@ -556,12 +586,10 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; - /* hash_lock unlocked by the header */ - __raw_spin_lock(&hash_lock); this.class = check_source->class; if (!save_trace(&this.trace)) return 0; - __raw_spin_unlock(&hash_lock); + print_circular_bug_entry(&this, 0); printk("\nother info that might help us debug this:\n\n"); @@ -577,8 +605,10 @@ static noinline int print_circular_bug_tail(void) static int noinline print_infinite_recursion_bug(void) { - __raw_spin_unlock(&hash_lock); - DEBUG_LOCKS_WARN_ON(1); + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); return 0; } @@ -713,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr, enum lock_usage_bit bit2, const char *irqclass) { - __raw_spin_unlock(&hash_lock); - debug_locks_off(); - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n======================================================\n"); @@ -796,9 +824,7 @@ static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { - debug_locks_off(); - __raw_spin_unlock(&hash_lock); - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=============================================\n"); @@ -974,14 +1000,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(prev->class) || verbose(next->class)) { - __raw_spin_unlock(&hash_lock); + graph_unlock(); printk("\n new dependency: "); print_lock_name(prev->class); printk(" => "); print_lock_name(next->class); printk("\n"); dump_stack(); - __raw_spin_lock(&hash_lock); + return graph_lock(); } return 1; } @@ -1046,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) } return 1; out_bug: - __raw_spin_unlock(&hash_lock); - DEBUG_LOCKS_WARN_ON(1); + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); return 0; } @@ -1201,7 +1229,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) hash_head = classhashentry(key); raw_local_irq_save(flags); - __raw_spin_lock(&hash_lock); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } /* * We have to do the hash-walk again, to avoid races * with another CPU: @@ -1214,9 +1245,12 @@ 
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - __raw_spin_unlock(&hash_lock); + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } raw_local_irq_restore(flags); - debug_locks_off(); + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); return NULL; @@ -1237,18 +1271,23 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) list_add_tail_rcu(&class->hash_entry, hash_head); if (verbose(class)) { - __raw_spin_unlock(&hash_lock); + graph_unlock(); raw_local_irq_restore(flags); + printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) printk("#%d", class->name_version); printk("\n"); dump_stack(); + raw_local_irq_save(flags); - __raw_spin_lock(&hash_lock); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } } out_unlock_set: - __raw_spin_unlock(&hash_lock); + graph_unlock(); raw_local_irq_restore(flags); if (!subclass || force) @@ -1264,7 +1303,7 @@ out_unlock_set: * add it and return 0 - in this case the new dependency chain is * validated. If the key is already hashed, return 1. */ -static inline int lookup_chain_cache(u64 chain_key) +static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) { struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; @@ -1278,34 +1317,36 @@ static inline int lookup_chain_cache(u64 chain_key) if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(&chain_lookup_hits); - /* - * In the debugging case, force redundant checking - * by returning 1: - */ -#ifdef CONFIG_DEBUG_LOCKDEP - __raw_spin_lock(&hash_lock); - return 1; -#endif + if (very_verbose(class)) + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); return 0; } } + if (very_verbose(class)) + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); /* * Allocate a new chain entry from the static array, and add * it to the hash: */ - __raw_spin_lock(&hash_lock); + if (!graph_lock()) + return 0; /* * We have to walk the chain again locked - to avoid duplicates: */ list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { - __raw_spin_unlock(&hash_lock); + graph_unlock(); goto cache_hit; } } if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { - __raw_spin_unlock(&hash_lock); - debug_locks_off(); + if (!debug_locks_off_graph_unlock()) + return 0; + printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); printk("turning off the locking correctness validator.\n"); return 0; @@ -1381,9 +1422,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, struct held_lock *this, int forwards, const char *irqclass) { - __raw_spin_unlock(&hash_lock); - debug_locks_off(); - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=========================================================\n"); @@ -1453,7 +1492,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); } -static inline void print_irqtrace_events(struct task_struct *curr) +void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); printk("hardirqs last enabled at (%u): ", 
curr->hardirq_enable_event); @@ -1466,19 +1505,13 @@ static inline void print_irqtrace_events(struct task_struct *curr) print_ip_sym(curr->softirq_disable_ip); } -#else -static inline void print_irqtrace_events(struct task_struct *curr) -{ -} #endif static int print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { - __raw_spin_unlock(&hash_lock); - debug_locks_off(); - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; printk("\n=================================\n"); @@ -1539,12 +1572,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (likely(this->class->usage_mask & new_mask)) return 1; - __raw_spin_lock(&hash_lock); + if (!graph_lock()) + return 0; /* * Make sure we didnt race: */ if (unlikely(this->class->usage_mask & new_mask)) { - __raw_spin_unlock(&hash_lock); + graph_unlock(); return 1; } @@ -1730,16 +1764,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, debug_atomic_dec(&nr_unused_locks); break; default: - __raw_spin_unlock(&hash_lock); - debug_locks_off(); + if (!debug_locks_off_graph_unlock()) + return 0; WARN_ON(1); return 0; } - __raw_spin_unlock(&hash_lock); + graph_unlock(); /* - * We must printk outside of the hash_lock: + * We must printk outside of the graph_lock: */ if (ret == 2) { printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); @@ -2137,9 +2171,9 @@ out_calc_hash: * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. * (If lookup_chain_cache() returns with 1 it acquires - * hash_lock for us) + * graph_lock for us) */ - if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { + if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { /* * Check whether last held lock: * @@ -2170,7 +2204,7 @@ out_calc_hash: if (!chain_head && ret != 2) if (!check_prevs_add(curr, hlock)) return 0; - __raw_spin_unlock(&hash_lock); + graph_unlock(); } curr->lockdep_depth++; check_chain_key(curr); @@ -2433,6 +2467,7 @@ EXPORT_SYMBOL_GPL(lock_release); void lockdep_reset(void) { unsigned long flags; + int i; raw_local_irq_save(flags); current->curr_chain_key = 0; @@ -2443,6 +2478,8 @@ void lockdep_reset(void) nr_softirq_chains = 0; nr_process_chains = 0; debug_locks = 1; + for (i = 0; i < CHAINHASH_SIZE; i++) + INIT_LIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -2479,7 +2516,7 @@ void lockdep_free_key_range(void *start, unsigned long size) int i; raw_local_irq_save(flags); - __raw_spin_lock(&hash_lock); + graph_lock(); /* * Unhash all classes that were created by this module: @@ -2493,7 +2530,7 @@ void lockdep_free_key_range(void *start, unsigned long size) zap_class(class); } - __raw_spin_unlock(&hash_lock); + graph_unlock(); raw_local_irq_restore(flags); } @@ -2521,20 +2558,20 @@ void lockdep_reset_lock(struct lockdep_map *lock) * Debug check: in the end all mapped classes should * be gone. 
*/ - __raw_spin_lock(&hash_lock); + graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { if (unlikely(class == lock->class_cache)) { - __raw_spin_unlock(&hash_lock); - DEBUG_LOCKS_WARN_ON(1); + if (debug_locks_off_graph_unlock()) + WARN_ON(1); goto out_restore; } } } - __raw_spin_unlock(&hash_lock); + graph_unlock(); out_restore: raw_local_irq_restore(flags); diff --git a/kernel/module.c b/kernel/module.c index d9eae45..8a94e05 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -537,6 +537,8 @@ static int already_uses(struct module *a, struct module *b) static int use_module(struct module *a, struct module *b) { struct module_use *use; + int no_warn; + if (b == NULL || already_uses(a, b)) return 1; if (!strong_try_module_get(b)) @@ -552,6 +554,7 @@ static int use_module(struct module *a, struct module *b) use->module_which_uses = a; list_add(&use->list, &b->modules_which_use_me); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } @@ -569,6 +572,7 @@ static void module_unload_free(struct module *mod) module_put(i); list_del(&use->list); kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); /* There can be at most one match. */ break; } @@ -824,9 +828,34 @@ static inline void module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ +static ssize_t show_initstate(struct module_attribute *mattr, + struct module *mod, char *buffer) +{ + const char *state = "unknown"; + + switch (mod->state) { + case MODULE_STATE_LIVE: + state = "live"; + break; + case MODULE_STATE_COMING: + state = "coming"; + break; + case MODULE_STATE_GOING: + state = "going"; + break; + } + return sprintf(buffer, "%s\n", state); +} + +static struct module_attribute initstate = { + .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, + .show = show_initstate, +}; + static struct module_attribute *modinfo_attrs[] = { &modinfo_version, &modinfo_srcversion, + &initstate, #ifdef CONFIG_MODULE_UNLOAD &refcnt, #endif @@ -1081,9 +1110,7 @@ static void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -static int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) +static int mod_sysfs_init(struct module *mod) { int err; @@ -1100,19 +1127,30 @@ static int mod_sysfs_setup(struct module *mod, kobj_set_kset_s(&mod->mkobj, module_subsys); mod->mkobj.mod = mod; - /* delay uevent until full sysfs population */ kobject_init(&mod->mkobj.kobj); + +out: + return err; +} + +static int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + /* delay uevent until full sysfs population */ err = kobject_add(&mod->mkobj.kobj); if (err) goto out; - mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); - if (!mod->drivers_dir) + mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); + if (!mod->holders_dir) goto out_unreg; err = module_param_sysfs_setup(mod, kparam, num_params); if (err) - goto out_unreg_drivers; + goto out_unreg_holders; err = module_add_modinfo_attrs(mod); if (err) @@ -1121,10 +1159,10 @@ static int mod_sysfs_setup(struct module *mod, kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; -out_unreg_drivers: - kobject_unregister(mod->drivers_dir); out_unreg_param: module_param_sysfs_remove(mod); +out_unreg_holders: + kobject_unregister(mod->holders_dir); out_unreg: kobject_del(&mod->mkobj.kobj); 
kobject_put(&mod->mkobj.kobj); @@ -1136,7 +1174,10 @@ static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); - kobject_unregister(mod->drivers_dir); + if (mod->mkobj.drivers_dir) + kobject_unregister(mod->mkobj.drivers_dir); + if (mod->holders_dir) + kobject_unregister(mod->holders_dir); kobject_unregister(&mod->mkobj.kobj); } @@ -1741,6 +1782,10 @@ static struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); + /* Initialize kobject, so we can reference it. */ + if (mod_sysfs_init(mod) != 0) + goto cleanup; + /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2300,26 +2345,78 @@ void print_modules(void) printk("\n"); } +static char *make_driver_name(struct device_driver *drv) +{ + char *driver_name; + + driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, + GFP_KERNEL); + if (!driver_name) + return NULL; + + sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); + return driver_name; +} + +static void module_create_drivers_dir(struct module_kobject *mk) +{ + if (!mk || mk->drivers_dir) + return; + + mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); +} + void module_add_driver(struct module *mod, struct device_driver *drv) { + char *driver_name; int no_warn; + struct module_kobject *mk = NULL; + + if (!drv) + return; + + if (mod) + mk = &mod->mkobj; + else if (drv->mod_name) { + struct kobject *mkobj; + + /* Lookup built-in module entry in /sys/modules */ + mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); + if (mkobj) + mk = container_of(mkobj, struct module_kobject, kobj); + } - if (!mod || !drv) + if (!mk) return; /* Don't check return codes; these calls are idempotent */ - no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); - no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name); + no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); + driver_name = make_driver_name(drv); + if (driver_name) { + module_create_drivers_dir(mk); + no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, + driver_name); + kfree(driver_name); + } } EXPORT_SYMBOL(module_add_driver); void module_remove_driver(struct device_driver *drv) { + char *driver_name; + if (!drv) return; + sysfs_remove_link(&drv->kobj, "module"); - if (drv->owner && drv->owner->drivers_dir) - sysfs_remove_link(drv->owner->drivers_dir, drv->name); + if (drv->owner && drv->owner->mkobj.drivers_dir) { + driver_name = make_driver_name(drv); + if (driver_name) { + sysfs_remove_link(drv->owner->mkobj.drivers_dir, + driver_name); + kfree(driver_name); + } + } } EXPORT_SYMBOL(module_remove_driver); diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf7..e7cbbb8 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); #endif /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb..f5b9ee6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -17,8 +17,9 @@ #include <linux/version.h> #include <linux/nsproxy.h> #include <linux/init_task.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include 
<linux/utsname.h> +#include <linux/pid_namespace.h> struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) struct nsproxy *ns = clone_namespaces(orig); if (ns) { - if (ns->namespace) - get_namespace(ns->namespace); + if (ns->mnt_ns) + get_mnt_ns(ns->mnt_ns); if (ns->uts_ns) get_uts_ns(ns->uts_ns); if (ns->ipc_ns) get_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + get_pid_ns(ns->pid_ns); } return ns; @@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) tsk->nsproxy = new_ns; - err = copy_namespace(flags, tsk); + err = copy_mnt_ns(flags, tsk); if (err) goto out_ns; @@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (err) goto out_ipc; + err = copy_pid_ns(flags, tsk); + if (err) + goto out_pid; + out: put_nsproxy(old_ns); return err; +out_pid: + if (new_ns->ipc_ns) + put_ipc_ns(new_ns->ipc_ns); out_ipc: if (new_ns->uts_ns) put_uts_ns(new_ns->uts_ns); out_uts: - if (new_ns->namespace) - put_namespace(new_ns->namespace); + if (new_ns->mnt_ns) + put_mnt_ns(new_ns->mnt_ns); out_ns: tsk->nsproxy = old_ns; kfree(new_ns); @@ -127,11 +137,13 @@ out_ns: void free_nsproxy(struct nsproxy *ns) { - if (ns->namespace) - put_namespace(ns->namespace); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - kfree(ns); + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); + kfree(ns); } diff --git a/kernel/params.c b/kernel/params.c index f406655..553cf7d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -30,6 +30,8 @@ #define DEBUGP(fmt, a...) #endif +static struct kobj_type module_ktype; + static inline char dash2underscore(char c) { if (c == '-') @@ -143,9 +145,15 @@ int parse_args(const char *name, while (*args) { int ret; + int irq_was_disabled; args = next_arg(args, ¶m, &val); + irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, params, num, unknown); + if (irq_was_disabled && !irqs_disabled()) { + printk(KERN_WARNING "parse_args(): option '%s' enabled " + "irq's!\n", param); + } switch (ret) { case -ENOENT: printk(KERN_ERR "%s: Unknown parameter `%s'\n", @@ -555,14 +563,11 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); kobject_set_name(&mk->kobj, name); - ret = kobject_register(&mk->kobj); + kobject_init(&mk->kobj); + ret = kobject_add(&mk->kobj); BUG_ON(ret < 0); - - /* no need to keep the kobject if no parameter is exported */ - if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { - kobject_unregister(&mk->kobj); - kfree(mk); - } + param_sysfs_setup(mk, kparam, num_params, name_skip); + kobject_uevent(&mk->kobj, KOBJ_ADD); } /* @@ -668,6 +673,19 @@ static struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &module_ktype) + return 1; + return 0; +} + +static struct kset_uevent_ops module_uevent_ops = { + .filter = uevent_filter, +}; + #else static struct sysfs_ops module_sysfs_ops = { .show = NULL, @@ -679,7 +697,7 @@ static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, }; -decl_subsys(module, &module_ktype, NULL); +decl_subsys(module, &module_ktype, &module_uevent_ops); /* * param_sysfs_init - wrapper for built-in params support diff --git a/kernel/pid.c 
b/kernel/pid.c index a48879b..78f2aee 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,7 +26,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> -#include <linux/pspace.h> +#include <linux/pid_namespace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) +static inline int mk_pid(struct pid_namespace *pid_ns, + struct pidmap *map, int off) { - return (map - pspace->pidmap)*BITS_PER_PAGE + off; + return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; } #define find_next_offset(map, off) \ @@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -struct pspace init_pspace = { +struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, - .last_pid = 0 + .last_pid = 0, + .child_reaper = &init_task }; /* @@ -80,25 +85,25 @@ struct pspace init_pspace = { static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(struct pspace *pspace, int pid) +static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) { - struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; + struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(struct pspace *pspace) +static int alloc_pidmap(struct pid_namespace *pid_ns) { - int i, offset, max_scan, pid, last = pspace->last_pid; + int i, offset, max_scan, pid, last = pid_ns->last_pid; struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[pid/BITS_PER_PAGE]; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { @@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pspace->last_pid = pid; + pid_ns->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pspace->pidmap[0]; + map = &pid_ns->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); } return -1; } -static int next_pidmap(struct pspace *pspace, int last) +static int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; struct pidmap *map, *end; offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pspace->pidmap[PIDMAP_ENTRIES]; + map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; + end 
= &pid_ns->pidmap[PIDMAP_ENTRIES]; for (; map < end; map++, offset = 0) { if (unlikely(!map->page)) continue; offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); if (offset < BITS_PER_PAGE) - return mk_pid(pspace, map, offset); + return mk_pid(pid_ns, map, offset); } return -1; } @@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(&init_pspace, pid->nr); + free_pidmap(&init_pid_ns, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -206,7 +211,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(&init_pspace); + nr = alloc_pidmap(current->nsproxy->pid_ns); if (nr < 0) goto out_free; @@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) pid = find_pid(nr); if (pid) break; - nr = next_pidmap(&init_pspace, nr); + nr = next_pidmap(current->nsproxy->pid_ns, nr); } while (nr > 0); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); +int copy_pid_ns(int flags, struct task_struct *tsk) +{ + struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_pid_ns(old_ns); + return err; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns; + + ns = container_of(kref, struct pid_namespace, kref); + kfree(ns); +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -382,10 +407,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pspace.pidmap[0].page); - atomic_dec(&init_pspace.pidmap[0].nr_free); + set_bit(0, init_pid_ns.pidmap[0].page); + atomic_dec(&init_pid_ns.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 710ed08..95f6657 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -20,13 +20,14 @@ config PM sending the processor to sleep and saving power. config PM_LEGACY - bool "Legacy Power Management API" + bool "Legacy Power Management API (DEPRECATED)" depends on PM - default y + default n ---help--- - Support for pm_register() and friends. + Support for pm_register() and friends. This old API is obsoleted + by the driver model. - If unsure, say Y. + If unsure, say N. config PM_DEBUG bool "Power Management Debug Support" @@ -130,3 +131,29 @@ config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM default y + +config APM_EMULATION + tristate "Advanced Power Management Emulation" + depends on PM && SYS_SUPPORTS_APM_EMULATION + help + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + In order to use APM, you will need supporting software. For location + and more information, read <file:Documentation/pm.txt> and the + Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. 
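The kernel/pid.c changes above replace struct pspace with struct pid_namespace and free it through a kref release callback (free_pid_ns). As a rough, illustrative sketch of that reference-counting pattern, using simplified stand-in names rather than the kernel's actual definitions:

/*
 * Illustrative sketch only: a simplified namespace object whose lifetime
 * is managed by a kref, mirroring the get_pid_ns()/put_pid_ns()/
 * free_pid_ns() pattern in the hunk above.  "demo_ns" and its helpers
 * are made-up names.
 */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ns {
	struct kref kref;
	/* ... namespace payload ... */
};

static struct demo_ns *alloc_demo_ns(void)
{
	struct demo_ns *ns = kzalloc(sizeof(*ns), GFP_KERNEL);

	if (ns)
		kref_init(&ns->kref);	/* refcount starts at 1 */
	return ns;
}

static void demo_ns_release(struct kref *kref)
{
	struct demo_ns *ns = container_of(kref, struct demo_ns, kref);

	kfree(ns);
}

static inline struct demo_ns *get_demo_ns(struct demo_ns *ns)
{
	kref_get(&ns->kref);
	return ns;
}

static inline void put_demo_ns(struct demo_ns *ns)
{
	kref_put(&ns->kref, demo_ns_release);
}

kref_put() only invokes the release function when the count drops to zero, which is why free_pid_ns() in the patch can simply kfree() the namespace.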
+ + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b00f56..88fc5d7 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -60,9 +60,11 @@ static void power_down(suspend_disk_method_t mode) { switch(mode) { case PM_DISK_PLATFORM: - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - pm_ops->enter(PM_SUSPEND_DISK); - break; + if (pm_ops && pm_ops->enter) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + pm_ops->enter(PM_SUSPEND_DISK); + break; + } case PM_DISK_SHUTDOWN: kernel_power_off(); break; diff --git a/kernel/power/main.c b/kernel/power/main.c index 500eb87..ff3a618 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -29,7 +29,7 @@ DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; +suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM; /** * pm_set_ops - Set the global power method table. diff --git a/kernel/power/process.c b/kernel/power/process.c index 99eeb11..6d566bf 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,8 +28,7 @@ static inline int freezeable(struct task_struct * p) if ((p == current) || (p->flags & PF_NOFREEZE) || (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD) || - (p->state == TASK_STOPPED)) + (p->exit_state == EXIT_DEAD)) return 0; return 1; } @@ -61,10 +60,16 @@ static inline void freeze_process(struct task_struct *p) unsigned long flags; if (!freezing(p)) { - freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + rmb(); + if (!frozen(p)) { + if (p->state == TASK_STOPPED) + force_sig_specific(SIGSTOP, p); + + freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, p->state == TASK_STOPPED); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } } } @@ -103,9 +108,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) if (frozen(p)) continue; - if (p->state == TASK_TRACED && - (frozen(p->parent) || - p->parent->state == TASK_STOPPED)) { + if (p->state == TASK_TRACED && frozen(p->parent)) { cancel_freezing(p); continue; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f133d4a..3581f8f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -165,14 +165,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + res = swap_type_of(swsusp_resume_device, swsusp_resume_block, + &resume_bdev); if (res < 0) return res; root_swap = res; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE); - if (IS_ERR(resume_bdev)) - return PTR_ERR(resume_bdev); + res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); + if (res) + return res; res = set_blocksize(resume_bdev, PAGE_SIZE); if (res < 0) diff --git a/kernel/power/user.c b/kernel/power/user.c index 89443b8..f7b7a78 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -57,7 +57,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct 
snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0) : -1; + swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; } else { data->swap = -1; @@ -268,7 +268,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, * so we need to recode them */ if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), 0); + data->swap = swap_type_of(old_decode_dev(arg), + 0, NULL); if (data->swap < 0) error = -ENODEV; } else { @@ -365,7 +366,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, swdev = old_decode_dev(swap_area.dev); if (swdev) { offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset); + data->swap = swap_type_of(swdev, offset, NULL); if (data->swap < 0) error = -ENODEV; } else { diff --git a/kernel/printk.c b/kernel/printk.c index 185bb45..c770e1a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -335,7 +335,7 @@ static void __call_console_drivers(unsigned long start, unsigned long end) static int __read_mostly ignore_loglevel; -int __init ignore_loglevel_setup(char *str) +static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = 1; printk(KERN_INFO "debug: ignoring loglevel setting.\n"); diff --git a/kernel/profile.c b/kernel/profile.c index fb5e03d..d6579d5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -40,7 +40,10 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; + int prof_on __read_mostly; +EXPORT_SYMBOL_GPL(prof_on); + static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); @@ -52,6 +55,7 @@ static int __init profile_setup(char * str) { static char __initdata schedstr[] = "schedule"; static char __initdata sleepstr[] = "sleep"; + static char __initdata kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -63,7 +67,7 @@ static int __init profile_setup(char * str) printk(KERN_INFO "kernel sleep profiling enabled (shift: %ld)\n", prof_shift); - } else if (!strncmp(str, sleepstr, strlen(sleepstr))) { + } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; @@ -72,6 +76,15 @@ static int __init profile_setup(char * str) printk(KERN_INFO "kernel schedule profiling enabled (shift: %ld)\n", prof_shift); + } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { + prof_on = KVM_PROFILING; + if (str[strlen(kvmstr)] == ',') + str += strlen(kvmstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel KVM profiling enabled (shift: %ld)\n", + prof_shift); } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; @@ -387,6 +400,8 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) } #endif /* !CONFIG_SMP */ +EXPORT_SYMBOL_GPL(profile_hits); + void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c52f981..482b11f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -522,6 +522,7 @@ rcu_torture_writer(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); set_user_nice(current, 19); + current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1); @@ -561,6 +562,7 @@ rcu_torture_fakewriter(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 
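The rcutorture kthreads above now set PF_NOFREEZE on themselves so the suspend/hibernate freezer skips them. A minimal sketch of that pattern, assuming a made-up kernel thread (demo_thread) rather than anything in this patch:

/*
 * Sketch, not from the patch: a kernel thread that opts out of the
 * freezer the same way the rcutorture threads do, by setting
 * PF_NOFREEZE before entering its main loop.
 * Started elsewhere with kthread_run(demo_thread, NULL, "demo").
 */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

static int demo_thread(void *unused)
{
	current->flags |= PF_NOFREEZE;	/* freezer will skip this task */

	while (!kthread_should_stop()) {
		/* ... periodic work ... */
		schedule_timeout_uninterruptible(HZ);
	}
	return 0;
}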
set_user_nice(current, 19); + current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -591,6 +593,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); + current->flags |= PF_NOFREEZE; do { idx = cur_ops->readlock(); diff --git a/kernel/relay.c b/kernel/relay.c index 75a3a9a..284e2e8 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -138,7 +138,7 @@ depopulate: */ struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); + struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) return NULL; @@ -302,7 +302,7 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @private: the channel buffer + * @work: work struct that contains the the channel buffer * * This is the work function used to defer reader waking. The * reason waking is deferred is that calling directly from write @@ -322,7 +322,7 @@ static void wakeup_readers(struct work_struct *work) * * See relay_reset for description of effect. */ -static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) +static void __relay_reset(struct rchan_buf *buf, unsigned int init) { size_t i; @@ -418,7 +418,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, * The channel buffer and channel buffer data structure are then freed * automatically when the last reference is given up. */ -static inline void relay_close_buf(struct rchan_buf *buf) +static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; cancel_delayed_work(&buf->wake_readers); @@ -426,7 +426,7 @@ static inline void relay_close_buf(struct rchan_buf *buf) kref_put(&buf->kref, relay_remove_buf); } -static inline void setup_callbacks(struct rchan *chan, +static void setup_callbacks(struct rchan *chan, struct rchan_callbacks *cb) { if (!cb) { @@ -479,7 +479,7 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; - chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); + chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) return NULL; @@ -946,11 +946,10 @@ typedef int (*subbuf_actor_t) (size_t read_start, /* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ -static inline ssize_t relay_file_read_subbufs(struct file *filp, - loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_actor_t actor, - read_descriptor_t *desc) +static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, + subbuf_actor_t subbuf_actor, + read_actor_t actor, + read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; @@ -959,7 +958,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, if (!desc->count) return 0; - mutex_lock(&filp->f_dentry->d_inode->i_mutex); + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -979,7 +978,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); return desc->written; } diff --git a/kernel/resource.c b/kernel/resource.c index 7b9a497..2a3f886 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/proc_fs.h> 
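The kernel/resource.c hunk below adds devres-managed region helpers, so a region requested on behalf of a device is released automatically when the device is detached. A hedged usage sketch with a made-up driver probe and I/O base; the convenience wrapper macro is assumed to live in a header not shown here, so the sketch calls __devm_request_region() directly:

/*
 * Usage sketch (assumed driver, not part of the patch): request an
 * I/O port range through the managed helper added below, so the
 * region is dropped automatically by the devres core and the error
 * and remove paths need no explicit __release_region().
 */
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/ioport.h>

#define DEMO_BASE	0x300	/* made-up I/O base */
#define DEMO_EXTENT	8

static int demo_probe(struct device *dev)
{
	if (!__devm_request_region(dev, &ioport_resource,
				   DEMO_BASE, DEMO_EXTENT, "demo"))
		return -EBUSY;

	/* ... rest of probe; no matching release needed on failure ... */
	return 0;
}

On a later probe failure or on device removal, devm_region_release() is called for the recorded region, which is the point of routing the request through devres.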
#include <linux/seq_file.h> +#include <linux/device.h> #include <asm/io.h> @@ -618,6 +619,67 @@ void __release_region(struct resource *parent, resource_size_t start, EXPORT_SYMBOL(__release_region); /* + * Managed region resource + */ +struct region_devres { + struct resource *parent; + resource_size_t start; + resource_size_t n; +}; + +static void devm_region_release(struct device *dev, void *res) +{ + struct region_devres *this = res; + + __release_region(this->parent, this->start, this->n); +} + +static int devm_region_match(struct device *dev, void *res, void *match_data) +{ + struct region_devres *this = res, *match = match_data; + + return this->parent == match->parent && + this->start == match->start && this->n == match->n; +} + +struct resource * __devm_request_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n, const char *name) +{ + struct region_devres *dr = NULL; + struct resource *res; + + dr = devres_alloc(devm_region_release, sizeof(struct region_devres), + GFP_KERNEL); + if (!dr) + return NULL; + + dr->parent = parent; + dr->start = start; + dr->n = n; + + res = __request_region(parent, start, n, name); + if (res) + devres_add(dev, dr); + else + devres_free(dr); + + return res; +} +EXPORT_SYMBOL(__devm_request_region); + +void __devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + struct region_devres match_data = { parent, start, n }; + + __release_region(parent, start, n); + WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + &match_data)); +} +EXPORT_SYMBOL(__devm_release_region); + +/* * Called from init/main.c to reserve IO ports. */ #define MAXRESERVE 4 diff --git a/kernel/sched.c b/kernel/sched.c index f385eff..cca93cc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,8 +225,10 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -547,7 +552,7 @@ 
rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) #endif /* - * rq_lock - lock a given runqueue and disable interrupts. + * this_rq_lock - lock this runqueue and disable interrupts. */ static inline struct rq *this_rq_lock(void) __acquires(rq->lock) @@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif @@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) (now - p->timestamp) >> 20); } - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: __activate_task(p, rq); } @@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1558,6 +1567,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1618,7 +1628,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) * runqueue lock is not a problem. */ current->time_slice = 1; - scheduler_tick(); + task_running_tick(cpu_rq(cpu), current); } local_irq_enable(); put_cpu(); @@ -1688,8 +1698,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. 
*/ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1952,6 +1962,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ @@ -1991,6 +2002,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -2061,8 +2077,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2098,10 +2114,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2199,11 +2220,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2241,7 +2257,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2270,10 +2286,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2289,9 +2309,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2299,6 +2324,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched 
group + * is eligible for doing load balancing at this and above + * domains. + */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2458,18 +2493,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2490,8 +2528,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2540,17 +2578,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; /* * When power savings policy is enabled for the parent domain, idle @@ -2566,7 +2604,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) + goto out_balanced; + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2590,11 +2632,13 @@ redo: * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. 
*/ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2611,13 +2655,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2627,7 +2671,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -2706,7 +2750,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2766,14 +2810,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; } /* @@ -2826,26 +2884,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +static void update_load(struct rq *this_rq) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); - struct sched_domain *sd; + unsigned long this_load; int i, scale; this_load = this_rq->raw_weighted_load; @@ -2865,6 +2906,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. 
+ */ +static DEFINE_SPINLOCK(balancing); + +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(), balance = 1; + struct rq *this_rq = cpu_rq(this_cpu); + unsigned long interval; + struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2879,8 +2946,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2888,39 +2960,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; } + this_rq->next_balance = next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT + if (!rq->nr_running) + return; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2934,7 +3015,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2947,8 +3029,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3048,35 +3129,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. 
- */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3144,8 +3202,34 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + update_cpu_clock(p, rq, now); + + if (p == rq->idle) + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else + task_running_tick(rq, p); +#ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +#endif } #ifdef CONFIG_SCHED_SMT @@ -3291,7 +3375,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -3345,6 +3430,8 @@ asmlinkage void __sched schedule(void) "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4530,15 +4617,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(int expected_preempt_count) -{ - if (unlikely(preempt_count() != expected_preempt_count)) - return 0; - if (unlikely(system_state != SYSTEM_RUNNING)) - return 0; - return 1; -} - static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -4558,7 +4636,8 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal(0)) { + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && + system_state == SYSTEM_RUNNING) { __cond_resched(); return 1; } @@ -4584,7 +4663,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal(1)) { + if (need_resched() && system_state == SYSTEM_RUNNING) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4600,7 +4679,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal(0)) { + if (need_resched() && system_state == SYSTEM_RUNNING) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); @@ -4990,8 +5069,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. 
*/ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -5067,7 +5146,10 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; @@ -5187,6 +5269,7 @@ void idle_task_exit(void) mmdrop(mm); } +/* called under rq->lock with disabled interrupts */ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); @@ -5203,10 +5286,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. + * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } @@ -5359,16 +5443,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5383,7 +5470,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5406,15 +5494,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5510,7 +5600,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -5528,28 +5618,27 @@ static int __init isolated_cpu_setup(char *str) __setup ("isolcpus=", isolated_cpu_setup); /* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function 
which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS + * (due to the fact that we keep track of groups covered with a cpumask_t). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(struct sched_group groups[], cpumask_t span, - const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map)) +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i, cpu_map); - struct sched_group *sg = &groups[group]; + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg); int j; if (cpu_isset(i, covered)) @@ -5559,7 +5648,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map) != group) + if (group_fn(j, cpu_map, NULL) != group) continue; cpu_set(j, covered); @@ -5733,8 +5822,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5843,11 +5933,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5857,17 +5947,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5905,7 +5995,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -5930,7 +6020,8 @@ static unsigned long long measure_migration_cost(int 
cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -6025,20 +6116,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { - if (num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); } + printk("\n"); } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused @@ -6135,10 +6224,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } #endif @@ -6148,39 +6240,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group); + return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_core, cpu); return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #else - return cpu; + group = cpu; #endif + if (sg) + *sg = &per_cpu(sched_group_phys, group); + return group; } #ifdef CONFIG_NUMA @@ -6193,12 +6298,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static 
DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return cpu_to_node(cpu); + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); + int group; + + cpus_and(nodemask, nodemask, *cpu_map); + group = first_cpu(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group); + return group; } + static void init_numa_sched_groups_power(struct sched_group *group_head) { struct sched_group *sg = group_head; @@ -6234,16 +6349,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - if (!sched_group_nodes) continue; @@ -6337,7 +6445,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; + int sd_allnodes = 0; /* * Allocate the per-node list of sched groups @@ -6355,7 +6463,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { - int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -6364,26 +6471,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc_node(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL, - cpu_to_node(i)); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - goto error; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i, cpu_map); - sd->groups = &sched_group_allnodes[group]; + cpu_to_allnodes_group(i, cpu_map, &sd->groups); p = sd; + sd_allnodes = 1; } else p = NULL; @@ -6398,36 +6491,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i, cpu_map); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; if (p) p->child = sd; - sd->groups = &sched_group_phys[group]; + cpu_to_phys_group(i, cpu_map, &sd->groups); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i, cpu_map); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_core[group]; + cpu_to_core_group(i, cpu_map, &sd->groups); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i, cpu_map); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_cpus[group]; + cpu_to_cpu_group(i, cpu_map, &sd->groups); #endif } @@ -6439,8 +6529,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) 
continue; - init_sched_build_groups(sched_group_cpus, this_sibling_map, - cpu_map, &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -6451,8 +6540,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(sched_group_core, this_core_map, - cpu_map, &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif @@ -6465,15 +6553,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(nodemask)) continue; - init_sched_build_groups(sched_group_phys, nodemask, - cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - cpu_map, &cpu_to_allnodes_group); + if (sd_allnodes) + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6565,10 +6651,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); - struct sched_group *sg = &sched_group_allnodes[group]; + if (sd_allnodes) { + struct sched_group *sg; + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); init_numa_sched_groups_power(sg); } #endif @@ -6779,7 +6865,7 @@ void __init sched_init_smp(void) lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); @@ -6847,6 +6933,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif @@ -6882,6 +6972,8 @@ void __might_sleep(char *file, int line) printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } #endif diff --git a/kernel/signal.c b/kernel/signal.c index ec81def..5630255 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,9 @@ #include <linux/signal.h> #include <linux/capability.h> #include <linux/freezer.h> +#include <linux/pid_namespace.h> +#include <linux/nsproxy.h> + #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || - (current->signal->session != t->signal->session)) + (process_session(current) != process_session(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) @@ -1702,7 +1705,9 @@ finish_stop(int stop_count) read_unlock(&tasklist_lock); } - schedule(); + do { + schedule(); + } while (try_to_freeze()); /* * Now we don't run again until continued. */ @@ -1877,8 +1882,12 @@ relock: if (sig_kernel_ignore(signr)) /* Default is nothing. 
*/ continue; - /* Init gets no signals it doesn't want. */ - if (current == child_reaper) + /* + * Init of a pid space gets no signals it doesn't want from + * within that pid space. It can of course get signals from + * its parent pid space. + */ + if (current == child_reaper(current)) continue; if (sig_kernel_stop(signr)) { diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29..6e2101d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -323,11 +323,18 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { - int ret; + int ret = NOTIFY_DONE; - down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v); - up_read(&nh->rwsem); + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_dereference(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain(&nh->head, val, v); + up_read(&nh->rwsem); + } return ret; } @@ -1381,7 +1388,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != group_leader->signal->session) + if (process_session(p) != process_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1397,16 +1404,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (pgid != pid) { - struct task_struct *p; + struct task_struct *g = + find_task_by_pid_type(PIDTYPE_PGID, pgid); - do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) - goto ok_pgid; - } while_each_task_pid(pgid, PIDTYPE_PGID, p); - goto out; + if (!g || process_session(g) != process_session(group_leader)) + goto out; } -ok_pgid: err = security_task_setpgid(p, pgid); if (err) goto out; @@ -1459,7 +1463,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) - return current->signal->session; + return process_session(current); else { int retval; struct task_struct *p; @@ -1471,7 +1475,7 @@ asmlinkage long sys_getsid(pid_t pid) if (p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = process_session(p); } read_unlock(&tasklist_lock); return retval; @@ -1484,7 +1488,6 @@ asmlinkage long sys_setsid(void) pid_t session; int err = -EPERM; - mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ @@ -1504,12 +1507,15 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(session, session); + + spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - mutex_unlock(&tty_mutex); return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e9f00f..600b333 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,7 +65,6 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int max_threads; -extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; @@ -92,7 +91,9 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, +static int proc_ipc_dointvec(ctl_table *table, 
int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -131,12 +132,22 @@ extern int max_lock_depth; #ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *, int, void __user *, size_t __user *, - void __user *, size_t, ctl_table *, void **); + void __user *, size_t, ctl_table *); #endif static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); + +#ifdef CONFIG_SYSVIPC +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -163,6 +174,40 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_SYSVIPC +static void *get_ipc(ctl_table *table, int write) +{ + char *which = table->data; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; + return which; +} +#else +#define get_ipc(T,W) ((T)->data) +#endif + /* /proc declarations: */ #ifdef CONFIG_PROC_SYSCTL @@ -229,7 +274,6 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { -#ifndef CONFIG_UTS_NS { .ctl_name = KERN_OSTYPE, .procname = "ostype", @@ -237,7 +281,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_OSRELEASE, @@ -246,7 +290,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_VERSION, @@ -255,7 +299,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_NODENAME, @@ -264,7 +308,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_DOMAINNAME, @@ -273,56 +317,8 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, -#else /* !CONFIG_UTS_NS */ - { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = NULL, - /* could maybe use __NEW_UTS_LEN here? 
*/ - .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, release), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, version), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, -#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -481,65 +477,72 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlmax, + .maxlen = sizeof (init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlall, + .maxlen = sizeof (init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.shm_ctlmni, + .maxlen = sizeof (init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmax, + .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmni, + .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmnb, + .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = NULL, + .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, #endif #ifdef CONFIG_MAGIC_SYSRQ { .ctl_name = KERN_SYSRQ, .procname = "sysrq", - .data = &sysrq_enabled, + .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -1239,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol do { 
struct ctl_table_header *head = list_entry(tmp, struct ctl_table_header, ctl_entry); - void *context = NULL; if (!use_table(head)) continue; @@ -1247,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol spin_unlock(&sysctl_lock); error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table, - &context); - kfree(context); + newval, newlen, head->ctl_table); spin_lock(&sysctl_lock); unuse_table(head); @@ -1305,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op) static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, - ctl_table *table, void **context) + ctl_table *table) { int n; repeat: @@ -1325,7 +1325,7 @@ repeat: error = table->strategy( table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (error) return error; } @@ -1336,7 +1336,7 @@ repeat: } error = do_sysctl_strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); return error; } } @@ -1347,7 +1347,7 @@ repeat: int do_sysctl_strategy (ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { int op = 0, rc; size_t len; @@ -1361,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table, if (table->strategy) { rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, size_t count, loff_t *ppos) { int op; - struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); + struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); struct ctl_table *table; size_t res; ssize_t error = -ENOTDIR; @@ -1753,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? 
*/ - -#ifndef CONFIG_UTS_NS -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int r; - if (!write) { - down_read(&uts_sem); - r=proc_dostring(table,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=proc_dostring(table,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - return r; -} -#else /* !CONFIG_UTS_NS */ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; - struct uts_namespace* uts_ns = current->nsproxy->uts_ns; - char* which; - - switch (table->ctl_name) { - case KERN_OSTYPE: - which = uts_ns->name.sysname; - break; - case KERN_NODENAME: - which = uts_ns->name.nodename; - break; - case KERN_OSRELEASE: - which = uts_ns->name.release; - break; - case KERN_VERSION: - which = uts_ns->name.version; - break; - case KERN_DOMAINNAME: - which = uts_ns->name.domainname; - break; - default: - r = -EINVAL; - goto out; - } - - if (!write) { - down_read(&uts_sem); - r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - out: + void *which; + which = get_uts(table, write); + r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); + put_uts(table, write, which); return r; } -#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1976,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 -#define OP_OR 2 -#define OP_MAX 3 -#define OP_MIN 4 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1990,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - case OP_MAX: if(*valp < val) - *valp = val; - break; - case OP_MIN: if(*valp > val) - *valp = val; - break; } } else { int val = *valp; @@ -2391,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, } #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - void *data; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; - - switch (table->ctl_name) { - case KERN_SHMMAX: - data = &ns->shm_ctlmax; - goto proc_minmax; - case KERN_SHMALL: - data = &ns->shm_ctlall; - goto proc_minmax; - case KERN_SHMMNI: - data = &ns->shm_ctlmni; - break; - case KERN_MSGMAX: - data = &ns->msg_ctlmax; - break; - case KERN_MSGMNI: - data = &ns->msg_ctlmni; - break; - case KERN_MSGMNB: - data = &ns->msg_ctlmnb; - break; - case KERN_SEM: - data = &ns->sem_ctls; - break; - default: - return -EINVAL; - } - - return __do_proc_dointvec(data, table, write, filp, buffer, + void *which; + which = get_ipc(table, write); + return __do_proc_dointvec(which, table, write, filp, buffer, lenp, ppos, NULL, NULL); -proc_minmax: - return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, +} + +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *which; + 
which = get_ipc(table, write); + return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, lenp, ppos, 1l, 1l); } + #endif static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, @@ -2475,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, { return -ENOSYS; } +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif int proc_dointvec(ctl_table *table, int write, struct file *filp, @@ -2539,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, /* The generic string strategy routine: */ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (!table->data || !table->maxlen) return -ENOTDIR; @@ -2585,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, */ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (newval && newlen) { @@ -2621,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2649,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2674,6 +2604,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } + +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen); + put_uts(table, write, uts_table.data); + return r; +} + +#ifdef CONFIG_SYSVIPC +/* The generic sysctl ipc data routine. 
*/ +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + size_t len; + void *data; + + /* Get out of I don't have a variable */ + if (!table->data || !table->maxlen) + return -ENOTDIR; + + data = get_ipc(table, 1); + if (!data) + return -ENOTDIR; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + if (newlen > table->maxlen) + newlen = table->maxlen; + + if (copy_from_user(data, newval, newlen)) + return -EFAULT; + } + return 1; +} +#endif + #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2712,32 +2700,44 @@ out: int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return -ENOSYS; +} +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return -ENOSYS; +} #endif /* CONFIG_SYSCTL_SYSCALL */ /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca59..22504af 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) /* check if clocksource is already registered */ if (is_registered_source(c)) { printk("register_clocksource: Cannot register %s. 
" - "Already registered!", c->name); + "Already registered!", c->name); ret = -EBUSY; } else { /* register it */ @@ -186,6 +186,7 @@ void clocksource_reselect(void) } EXPORT_SYMBOL(clocksource_reselect); +#ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused @@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) * Sysfs setup bits: */ static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, - sysfs_override_clocksource); + sysfs_override_clocksource); static SYSDEV_ATTR(available_clocksource, 0600, - sysfs_show_available_clocksources, NULL); + sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { set_kset_name("clocksource"), @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) } device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ /** * boot_override_clocksource - boot clock override diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbc..c2a8ccf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. 
+ */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + /* + * In theory the following code can skip a jiffy in case jiffies + * increments right between the addition and the later subtraction. + * However since the entire point of this function is to use approximate + * timeouts, it's entirely ok to not handle that. + */ + return __round_jiffies(j + jiffies, cpu) - jiffies; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. 
+ */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + + static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { @@ -714,7 +846,7 @@ static int change_clocksource(void) clock = new; clock->cycle_last = now; printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); + clock->name); return 1; } else if (clock->update_callback) { return clock->update_callback(); @@ -722,7 +854,10 @@ static int change_clocksource(void) return 0; } #else -#define change_clocksource() (0) +static inline int change_clocksource(void) +{ + return 0; +} #endif /** @@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) { s64 tick_error, i; u32 look_ahead, adj; @@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); tick_error -= clock->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->mult += adj; clock->xtime_interval += interval; clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); } /** @@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks) unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - active_tasks = count_active_tasks(); - for (count -= ticks; count < 0; count += LOAD_FREQ) { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count -= ticks; + if (unlikely(count < 0)) { + active_tasks = count_active_tasks(); + do { + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count += LOAD_FREQ; + } while (count < 0); } } @@ -1202,11 +1344,10 @@ fastcall signed long __sched schedule_timeout(signed long timeout) * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. 
*/ - if (timeout < 0) - { + if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx\n", timeout); + dump_stack(); current->state = TASK_RUNNING; goto out; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f7701..baacc36 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->write_char = p->wchar; stats->read_syscalls = p->syscr; stats->write_syscalls = p->syscw; +#ifdef CONFIG_TASK_IO_ACCOUNTING + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; +#else + stats->read_bytes = 0; + stats->write_bytes = 0; + stats->cancelled_write_bytes = 0; +#endif } #undef KB #undef MB diff --git a/kernel/unwind.c b/kernel/unwind.c deleted file mode 100644 index 09c2613..0000000 --- a/kernel/unwind.c +++ /dev/null @@ -1,1305 +0,0 @@ -/* - * Copyright (C) 2002-2006 Novell, Inc. - * Jan Beulich <jbeulich@novell.com> - * This code is released under version 2 of the GNU GPL. - * - * A simple API for unwinding kernel stacks. This is used for - * debugging and error reporting purposes. The kernel doesn't need - * full-blown stack unwinding with all the bells and whistles, so there - * is not much point in implementing the full Dwarf2 unwind API. - */ - -#include <linux/unwind.h> -#include <linux/module.h> -#include <linux/bootmem.h> -#include <linux/sort.h> -#include <linux/stop_machine.h> -#include <linux/uaccess.h> -#include <asm/sections.h> -#include <asm/uaccess.h> -#include <asm/unaligned.h> - -extern const char __start_unwind[], __end_unwind[]; -extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; - -#define MAX_STACK_DEPTH 8 - -#define EXTRA_INFO(f) { \ - BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ - % FIELD_SIZEOF(struct unwind_frame_info, f)) \ - + offsetof(struct unwind_frame_info, f) \ - / FIELD_SIZEOF(struct unwind_frame_info, f), \ - FIELD_SIZEOF(struct unwind_frame_info, f) \ - } -#define PTREGS_INFO(f) EXTRA_INFO(regs.f) - -static const struct { - unsigned offs:BITS_PER_LONG / 2; - unsigned width:BITS_PER_LONG / 2; -} reg_info[] = { - UNW_REGISTER_INFO -}; - -#undef PTREGS_INFO -#undef EXTRA_INFO - -#ifndef REG_INVALID -#define REG_INVALID(r) (reg_info[r].width == 0) -#endif - -#define DW_CFA_nop 0x00 -#define DW_CFA_set_loc 0x01 -#define DW_CFA_advance_loc1 0x02 -#define DW_CFA_advance_loc2 0x03 -#define DW_CFA_advance_loc4 0x04 -#define DW_CFA_offset_extended 0x05 -#define DW_CFA_restore_extended 0x06 -#define DW_CFA_undefined 0x07 -#define DW_CFA_same_value 0x08 -#define DW_CFA_register 0x09 -#define DW_CFA_remember_state 0x0a -#define DW_CFA_restore_state 0x0b -#define DW_CFA_def_cfa 0x0c -#define DW_CFA_def_cfa_register 0x0d -#define DW_CFA_def_cfa_offset 0x0e -#define DW_CFA_def_cfa_expression 0x0f -#define DW_CFA_expression 0x10 -#define DW_CFA_offset_extended_sf 0x11 -#define DW_CFA_def_cfa_sf 0x12 -#define DW_CFA_def_cfa_offset_sf 0x13 -#define DW_CFA_val_offset 0x14 -#define DW_CFA_val_offset_sf 0x15 -#define DW_CFA_val_expression 0x16 -#define DW_CFA_lo_user 0x1c -#define DW_CFA_GNU_window_save 0x2d -#define DW_CFA_GNU_args_size 0x2e -#define DW_CFA_GNU_negative_offset_extended 0x2f -#define DW_CFA_hi_user 0x3f - -#define DW_EH_PE_FORM 0x07 -#define DW_EH_PE_native 0x00 -#define DW_EH_PE_leb128 0x01 -#define DW_EH_PE_data2 0x02 -#define DW_EH_PE_data4 0x03 -#define DW_EH_PE_data8 0x04 
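/*
 * Editorial note, not part of the original patch: the DW_EH_PE_* values
 * defined just above and below pack a pointer encoding into one byte.
 * The low bits (DW_EH_PE_FORM, together with the DW_EH_PE_signed flag
 * below) select how the value is stored (native word, LEB128, 2/4/8-byte
 * data, signed or unsigned), the DW_EH_PE_ADJUST bits select what the
 * value is relative to (absolute, pc-, text-, data-relative, ...),
 * DW_EH_PE_indirect adds one extra dereference, and DW_EH_PE_omit means
 * no pointer is present at all.  read_pointer() in the code removed
 * further below applies exactly these masks.  A minimal stand-alone
 * sketch of that split (helper name hypothetical):
 */
static inline void dw_eh_pe_split(unsigned char enc, unsigned *form,
				  unsigned *adjust, int *indirect)
{
	*form     = enc & 0x07;		/* DW_EH_PE_FORM: native/leb128/data2/4/8 */
	*adjust   = enc & 0x70;		/* DW_EH_PE_ADJUST: abs/pcrel/textrel/... */
	*indirect = !!(enc & 0x80);	/* DW_EH_PE_indirect: dereference once more */
}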
-#define DW_EH_PE_signed 0x08 -#define DW_EH_PE_ADJUST 0x70 -#define DW_EH_PE_abs 0x00 -#define DW_EH_PE_pcrel 0x10 -#define DW_EH_PE_textrel 0x20 -#define DW_EH_PE_datarel 0x30 -#define DW_EH_PE_funcrel 0x40 -#define DW_EH_PE_aligned 0x50 -#define DW_EH_PE_indirect 0x80 -#define DW_EH_PE_omit 0xff - -typedef unsigned long uleb128_t; -typedef signed long sleb128_t; -#define sleb128abs __builtin_labs - -static struct unwind_table { - struct { - unsigned long pc; - unsigned long range; - } core, init; - const void *address; - unsigned long size; - const unsigned char *header; - unsigned long hdrsz; - struct unwind_table *link; - const char *name; -} root_table; - -struct unwind_item { - enum item_location { - Nowhere, - Memory, - Register, - Value - } where; - uleb128_t value; -}; - -struct unwind_state { - uleb128_t loc, org; - const u8 *cieStart, *cieEnd; - uleb128_t codeAlign; - sleb128_t dataAlign; - struct cfa { - uleb128_t reg, offs; - } cfa; - struct unwind_item regs[ARRAY_SIZE(reg_info)]; - unsigned stackDepth:8; - unsigned version:8; - const u8 *label; - const u8 *stack[MAX_STACK_DEPTH]; -}; - -static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; - -static unsigned unwind_debug; -static int __init unwind_debug_setup(char *s) -{ - unwind_debug = simple_strtoul(s, NULL, 0); - return 1; -} -__setup("unwind_debug=", unwind_debug_setup); -#define dprintk(lvl, fmt, args...) \ - ((void)(lvl > unwind_debug \ - || printk(KERN_DEBUG "unwind: " fmt "\n", ##args))) - -static struct unwind_table *find_table(unsigned long pc) -{ - struct unwind_table *table; - - for (table = &root_table; table; table = table->link) - if ((pc >= table->core.pc - && pc < table->core.pc + table->core.range) - || (pc >= table->init.pc - && pc < table->init.pc + table->init.range)) - break; - - return table; -} - -static unsigned long read_pointer(const u8 **pLoc, - const void *end, - signed ptrType, - unsigned long text_base, - unsigned long data_base); - -static void init_unwind_table(struct unwind_table *table, - const char *name, - const void *core_start, - unsigned long core_size, - const void *init_start, - unsigned long init_size, - const void *table_start, - unsigned long table_size, - const u8 *header_start, - unsigned long header_size) -{ - const u8 *ptr = header_start + 4; - const u8 *end = header_start + header_size; - - table->core.pc = (unsigned long)core_start; - table->core.range = core_size; - table->init.pc = (unsigned long)init_start; - table->init.range = init_size; - table->address = table_start; - table->size = table_size; - /* See if the linker provided table looks valid. 
*/ - if (header_size <= 4 - || header_start[0] != 1 - || (void *)read_pointer(&ptr, end, header_start[1], 0, 0) - != table_start - || !read_pointer(&ptr, end, header_start[2], 0, 0) - || !read_pointer(&ptr, end, header_start[3], 0, - (unsigned long)header_start) - || !read_pointer(&ptr, end, header_start[3], 0, - (unsigned long)header_start)) - header_start = NULL; - table->hdrsz = header_size; - smp_wmb(); - table->header = header_start; - table->link = NULL; - table->name = name; -} - -void __init unwind_init(void) -{ - init_unwind_table(&root_table, "kernel", - _text, _end - _text, - NULL, 0, - __start_unwind, __end_unwind - __start_unwind, - __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); -} - -static const u32 bad_cie, not_fde; -static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); -static signed fde_pointer_type(const u32 *cie); - -struct eh_frame_hdr_table_entry { - unsigned long start, fde; -}; - -static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) -{ - const struct eh_frame_hdr_table_entry *e1 = p1; - const struct eh_frame_hdr_table_entry *e2 = p2; - - return (e1->start > e2->start) - (e1->start < e2->start); -} - -static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) -{ - struct eh_frame_hdr_table_entry *e1 = p1; - struct eh_frame_hdr_table_entry *e2 = p2; - unsigned long v; - - v = e1->start; - e1->start = e2->start; - e2->start = v; - v = e1->fde; - e1->fde = e2->fde; - e2->fde = v; -} - -static void __init setup_unwind_table(struct unwind_table *table, - void *(*alloc)(unsigned long)) -{ - const u8 *ptr; - unsigned long tableSize = table->size, hdrSize; - unsigned n; - const u32 *fde; - struct { - u8 version; - u8 eh_frame_ptr_enc; - u8 fde_count_enc; - u8 table_enc; - unsigned long eh_frame_ptr; - unsigned int fde_count; - struct eh_frame_hdr_table_entry table[]; - } __attribute__((__packed__)) *header; - - if (table->header) - return; - - if (table->hdrsz) - printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", - table->name); - - if (tableSize & (sizeof(*fde) - 1)) - return; - - for (fde = table->address, n = 0; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { - const u32 *cie = cie_for_fde(fde, table); - signed ptrType; - - if (cie == ¬_fde) - continue; - if (cie == NULL - || cie == &bad_cie - || (ptrType = fde_pointer_type(cie)) < 0) - return; - ptr = (const u8 *)(fde + 2); - if (!read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType, 0, 0)) - return; - ++n; - } - - if (tableSize || !n) - return; - - hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) - + 2 * n * sizeof(unsigned long); - dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize); - header = alloc(hdrSize); - if (!header) - return; - header->version = 1; - header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; - header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; - header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; - put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); - BUILD_BUG_ON(offsetof(typeof(*header), fde_count) - % __alignof(typeof(header->fde_count))); - header->fde_count = n; - - BUILD_BUG_ON(offsetof(typeof(*header), table) - % __alignof(typeof(*header->table))); - for (fde = table->address, tableSize = table->size, n = 0; - tableSize; - tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { - const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); - - if 
(!fde[1]) - continue; /* this is a CIE */ - ptr = (const u8 *)(fde + 2); - header->table[n].start = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - fde_pointer_type(cie), 0, 0); - header->table[n].fde = (unsigned long)fde; - ++n; - } - WARN_ON(n != header->fde_count); - - sort(header->table, - n, - sizeof(*header->table), - cmp_eh_frame_hdr_table_entries, - swap_eh_frame_hdr_table_entries); - - table->hdrsz = hdrSize; - smp_wmb(); - table->header = (const void *)header; -} - -static void *__init balloc(unsigned long sz) -{ - return __alloc_bootmem_nopanic(sz, - sizeof(unsigned int), - __pa(MAX_DMA_ADDRESS)); -} - -void __init unwind_setup(void) -{ - setup_unwind_table(&root_table, balloc); -} - -#ifdef CONFIG_MODULES - -static struct unwind_table *last_table; - -/* Must be called with module_mutex held. */ -void *unwind_add_table(struct module *module, - const void *table_start, - unsigned long table_size) -{ - struct unwind_table *table; - - if (table_size <= 0) - return NULL; - - table = kmalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return NULL; - - init_unwind_table(table, module->name, - module->module_core, module->core_size, - module->module_init, module->init_size, - table_start, table_size, - NULL, 0); - - if (last_table) - last_table->link = table; - else - root_table.link = table; - last_table = table; - - return table; -} - -struct unlink_table_info -{ - struct unwind_table *table; - int init_only; -}; - -static int unlink_table(void *arg) -{ - struct unlink_table_info *info = arg; - struct unwind_table *table = info->table, *prev; - - for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) - ; - - if (prev->link) { - if (info->init_only) { - table->init.pc = 0; - table->init.range = 0; - info->table = NULL; - } else { - prev->link = table->link; - if (!prev->link) - last_table = prev; - } - } else - info->table = NULL; - - return 0; -} - -/* Must be called with module_mutex held. 
*/ -void unwind_remove_table(void *handle, int init_only) -{ - struct unwind_table *table = handle; - struct unlink_table_info info; - - if (!table || table == &root_table) - return; - - if (init_only && table == last_table) { - table->init.pc = 0; - table->init.range = 0; - return; - } - - info.table = table; - info.init_only = init_only; - stop_machine_run(unlink_table, &info, NR_CPUS); - - if (info.table) - kfree(table); -} - -#endif /* CONFIG_MODULES */ - -static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) -{ - const u8 *cur = *pcur; - uleb128_t value; - unsigned shift; - - for (shift = 0, value = 0; cur < end; shift += 7) { - if (shift + 7 > 8 * sizeof(value) - && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { - cur = end + 1; - break; - } - value |= (uleb128_t)(*cur & 0x7f) << shift; - if (!(*cur++ & 0x80)) - break; - } - *pcur = cur; - - return value; -} - -static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) -{ - const u8 *cur = *pcur; - sleb128_t value; - unsigned shift; - - for (shift = 0, value = 0; cur < end; shift += 7) { - if (shift + 7 > 8 * sizeof(value) - && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { - cur = end + 1; - break; - } - value |= (sleb128_t)(*cur & 0x7f) << shift; - if (!(*cur & 0x80)) { - value |= -(*cur++ & 0x40) << shift; - break; - } - } - *pcur = cur; - - return value; -} - -static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) -{ - const u32 *cie; - - if (!*fde || (*fde & (sizeof(*fde) - 1))) - return &bad_cie; - if (!fde[1]) - return ¬_fde; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) - return NULL; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1]) - return NULL; /* this is not a (valid) CIE */ - return cie; -} - -static unsigned long read_pointer(const u8 **pLoc, - const void *end, - signed ptrType, - unsigned long text_base, - unsigned long data_base) -{ - unsigned long value = 0; - union { - const u8 *p8; - const u16 *p16u; - const s16 *p16s; - const u32 *p32u; - const s32 *p32s; - const unsigned long *pul; - } ptr; - - if (ptrType < 0 || ptrType == DW_EH_PE_omit) { - dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end); - return 0; - } - ptr.p8 = *pLoc; - switch(ptrType & DW_EH_PE_FORM) { - case DW_EH_PE_data2: - if (end < (const void *)(ptr.p16u + 1)) { - dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end); - return 0; - } - if(ptrType & DW_EH_PE_signed) - value = get_unaligned(ptr.p16s++); - else - value = get_unaligned(ptr.p16u++); - break; - case DW_EH_PE_data4: -#ifdef CONFIG_64BIT - if (end < (const void *)(ptr.p32u + 1)) { - dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end); - return 0; - } - if(ptrType & DW_EH_PE_signed) - value = get_unaligned(ptr.p32s++); - else - value = get_unaligned(ptr.p32u++); - break; - case DW_EH_PE_data8: - BUILD_BUG_ON(sizeof(u64) != sizeof(value)); -#else - BUILD_BUG_ON(sizeof(u32) != sizeof(value)); -#endif - case DW_EH_PE_native: - if (end < (const void *)(ptr.pul + 1)) { - dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end); - return 0; - } - value = get_unaligned(ptr.pul++); - break; - case DW_EH_PE_leb128: - BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); - value = ptrType & DW_EH_PE_signed - ? 
get_sleb128(&ptr.p8, end) - : get_uleb128(&ptr.p8, end); - if ((const void *)ptr.p8 > end) { - dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end); - return 0; - } - break; - default: - dprintk(2, "Cannot decode pointer type %02X (%p,%p).", - ptrType, ptr.p8, end); - return 0; - } - switch(ptrType & DW_EH_PE_ADJUST) { - case DW_EH_PE_abs: - break; - case DW_EH_PE_pcrel: - value += (unsigned long)*pLoc; - break; - case DW_EH_PE_textrel: - if (likely(text_base)) { - value += text_base; - break; - } - dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.", - ptrType, *pLoc, end); - return 0; - case DW_EH_PE_datarel: - if (likely(data_base)) { - value += data_base; - break; - } - dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.", - ptrType, *pLoc, end); - return 0; - default: - dprintk(2, "Cannot adjust pointer type %02X (%p,%p).", - ptrType, *pLoc, end); - return 0; - } - if ((ptrType & DW_EH_PE_indirect) - && probe_kernel_address((unsigned long *)value, value)) { - dprintk(1, "Cannot read indirect value %lx (%p,%p).", - value, *pLoc, end); - return 0; - } - *pLoc = ptr.p8; - - return value; -} - -static signed fde_pointer_type(const u32 *cie) -{ - const u8 *ptr = (const u8 *)(cie + 2); - unsigned version = *ptr; - - if (version != 1) - return -1; /* unsupported */ - if (*++ptr) { - const char *aug; - const u8 *end = (const u8 *)(cie + 1) + *cie; - uleb128_t len; - - /* check if augmentation size is first (and thus present) */ - if (*ptr != 'z') - return -1; - /* check if augmentation string is nul-terminated */ - if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) - return -1; - ++ptr; /* skip terminator */ - get_uleb128(&ptr, end); /* skip code alignment */ - get_sleb128(&ptr, end); /* skip data alignment */ - /* skip return address column */ - version <= 1 ? 
(void)++ptr : (void)get_uleb128(&ptr, end); - len = get_uleb128(&ptr, end); /* augmentation length */ - if (ptr + len < ptr || ptr + len > end) - return -1; - end = ptr + len; - while (*++aug) { - if (ptr >= end) - return -1; - switch(*aug) { - case 'L': - ++ptr; - break; - case 'P': { - signed ptrType = *ptr++; - - if (!read_pointer(&ptr, end, ptrType, 0, 0) - || ptr > end) - return -1; - } - break; - case 'R': - return *ptr; - default: - return -1; - } - } - } - return DW_EH_PE_native|DW_EH_PE_abs; -} - -static int advance_loc(unsigned long delta, struct unwind_state *state) -{ - state->loc += delta * state->codeAlign; - - return delta > 0; -} - -static void set_rule(uleb128_t reg, - enum item_location where, - uleb128_t value, - struct unwind_state *state) -{ - if (reg < ARRAY_SIZE(state->regs)) { - state->regs[reg].where = where; - state->regs[reg].value = value; - } -} - -static int processCFI(const u8 *start, - const u8 *end, - unsigned long targetLoc, - signed ptrType, - struct unwind_state *state) -{ - union { - const u8 *p8; - const u16 *p16; - const u32 *p32; - } ptr; - int result = 1; - - if (start != state->cieStart) { - state->loc = state->org; - result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); - if (targetLoc == 0 && state->label == NULL) - return result; - } - for (ptr.p8 = start; result && ptr.p8 < end; ) { - switch(*ptr.p8 >> 6) { - uleb128_t value; - - case 0: - switch(*ptr.p8++) { - case DW_CFA_nop: - break; - case DW_CFA_set_loc: - state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0); - if (state->loc == 0) - result = 0; - break; - case DW_CFA_advance_loc1: - result = ptr.p8 < end && advance_loc(*ptr.p8++, state); - break; - case DW_CFA_advance_loc2: - result = ptr.p8 <= end + 2 - && advance_loc(*ptr.p16++, state); - break; - case DW_CFA_advance_loc4: - result = ptr.p8 <= end + 4 - && advance_loc(*ptr.p32++, state); - break; - case DW_CFA_offset_extended: - value = get_uleb128(&ptr.p8, end); - set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); - break; - case DW_CFA_val_offset: - value = get_uleb128(&ptr.p8, end); - set_rule(value, Value, get_uleb128(&ptr.p8, end), state); - break; - case DW_CFA_offset_extended_sf: - value = get_uleb128(&ptr.p8, end); - set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); - break; - case DW_CFA_val_offset_sf: - value = get_uleb128(&ptr.p8, end); - set_rule(value, Value, get_sleb128(&ptr.p8, end), state); - break; - case DW_CFA_restore_extended: - case DW_CFA_undefined: - case DW_CFA_same_value: - set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); - break; - case DW_CFA_register: - value = get_uleb128(&ptr.p8, end); - set_rule(value, - Register, - get_uleb128(&ptr.p8, end), state); - break; - case DW_CFA_remember_state: - if (ptr.p8 == state->label) { - state->label = NULL; - return 1; - } - if (state->stackDepth >= MAX_STACK_DEPTH) { - dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end); - return 0; - } - state->stack[state->stackDepth++] = ptr.p8; - break; - case DW_CFA_restore_state: - if (state->stackDepth) { - const uleb128_t loc = state->loc; - const u8 *label = state->label; - - state->label = state->stack[state->stackDepth - 1]; - memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); - memset(state->regs, 0, sizeof(state->regs)); - state->stackDepth = 0; - result = processCFI(start, end, 0, ptrType, state); - state->loc = loc; - state->label = label; - } else { - dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end); - return 0; - } - break; - case DW_CFA_def_cfa: - state->cfa.reg 
= get_uleb128(&ptr.p8, end); - /*nobreak*/ - case DW_CFA_def_cfa_offset: - state->cfa.offs = get_uleb128(&ptr.p8, end); - break; - case DW_CFA_def_cfa_sf: - state->cfa.reg = get_uleb128(&ptr.p8, end); - /*nobreak*/ - case DW_CFA_def_cfa_offset_sf: - state->cfa.offs = get_sleb128(&ptr.p8, end) - * state->dataAlign; - break; - case DW_CFA_def_cfa_register: - state->cfa.reg = get_uleb128(&ptr.p8, end); - break; - /*todo case DW_CFA_def_cfa_expression: */ - /*todo case DW_CFA_expression: */ - /*todo case DW_CFA_val_expression: */ - case DW_CFA_GNU_args_size: - get_uleb128(&ptr.p8, end); - break; - case DW_CFA_GNU_negative_offset_extended: - value = get_uleb128(&ptr.p8, end); - set_rule(value, - Memory, - (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); - break; - case DW_CFA_GNU_window_save: - default: - dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end); - result = 0; - break; - } - break; - case 1: - result = advance_loc(*ptr.p8++ & 0x3f, state); - break; - case 2: - value = *ptr.p8++ & 0x3f; - set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); - break; - case 3: - set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); - break; - } - if (ptr.p8 > end) { - dprintk(1, "Data overrun (%p,%p).", ptr.p8, end); - result = 0; - } - if (result && targetLoc != 0 && targetLoc < state->loc) - return 1; - } - - if (result && ptr.p8 < end) - dprintk(1, "Data underrun (%p,%p).", ptr.p8, end); - - return result - && ptr.p8 == end - && (targetLoc == 0 - || (/*todo While in theory this should apply, gcc in practice omits - everything past the function prolog, and hence the location - never reaches the end of the function. - targetLoc < state->loc &&*/ state->label == NULL)); -} - -/* Unwind to previous to frame. Returns 0 if successful, negative - * number in case of an error. 
*/ -int unwind(struct unwind_frame_info *frame) -{ -#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) - const u32 *fde = NULL, *cie = NULL; - const u8 *ptr = NULL, *end = NULL; - unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; - unsigned long startLoc = 0, endLoc = 0, cfa; - unsigned i; - signed ptrType = -1; - uleb128_t retAddrReg = 0; - const struct unwind_table *table; - struct unwind_state state; - - if (UNW_PC(frame) == 0) - return -EINVAL; - if ((table = find_table(pc)) != NULL - && !(table->size & (sizeof(*fde) - 1))) { - const u8 *hdr = table->header; - unsigned long tableSize; - - smp_rmb(); - if (hdr && hdr[0] == 1) { - switch(hdr[3] & DW_EH_PE_FORM) { - case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; - case DW_EH_PE_data2: tableSize = 2; break; - case DW_EH_PE_data4: tableSize = 4; break; - case DW_EH_PE_data8: tableSize = 8; break; - default: tableSize = 0; break; - } - ptr = hdr + 4; - end = hdr + table->hdrsz; - if (tableSize - && read_pointer(&ptr, end, hdr[1], 0, 0) - == (unsigned long)table->address - && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0 - && i == (end - ptr) / (2 * tableSize) - && !((end - ptr) % (2 * tableSize))) { - do { - const u8 *cur = ptr + (i / 2) * (2 * tableSize); - - startLoc = read_pointer(&cur, - cur + tableSize, - hdr[3], 0, - (unsigned long)hdr); - if (pc < startLoc) - i /= 2; - else { - ptr = cur - tableSize; - i = (i + 1) / 2; - } - } while (startLoc && i > 1); - if (i == 1 - && (startLoc = read_pointer(&ptr, - ptr + tableSize, - hdr[3], 0, - (unsigned long)hdr)) != 0 - && pc >= startLoc) - fde = (void *)read_pointer(&ptr, - ptr + tableSize, - hdr[3], 0, - (unsigned long)hdr); - } - } - if(hdr && !fde) - dprintk(3, "Binary lookup for %lx failed.", pc); - - if (fde != NULL) { - cie = cie_for_fde(fde, table); - ptr = (const u8 *)(fde + 2); - if(cie != NULL - && cie != &bad_cie - && cie != ¬_fde - && (ptrType = fde_pointer_type(cie)) >= 0 - && read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType, 0, 0) == startLoc) { - if (!(ptrType & DW_EH_PE_indirect)) - ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType, 0, 0); - if(pc >= endLoc) - fde = NULL; - } else - fde = NULL; - if(!fde) - dprintk(1, "Binary lookup result for %lx discarded.", pc); - } - if (fde == NULL) { - for (fde = table->address, tableSize = table->size; - cie = NULL, tableSize > sizeof(*fde) - && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - cie = cie_for_fde(fde, table); - if (cie == &bad_cie) { - cie = NULL; - break; - } - if (cie == NULL - || cie == ¬_fde - || (ptrType = fde_pointer_type(cie)) < 0) - continue; - ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType, 0, 0); - if (!startLoc) - continue; - if (!(ptrType & DW_EH_PE_indirect)) - ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType, 0, 0); - if (pc >= startLoc && pc < endLoc) - break; - } - if(!fde) - dprintk(3, "Linear lookup for %lx failed.", pc); - } - } - if (cie != NULL) { - memset(&state, 0, sizeof(state)); - state.cieEnd = ptr; /* keep here temporarily */ - ptr = (const u8 *)(cie + 2); - end = (const u8 *)(cie + 1) + *cie; - frame->call_frame = 1; - if ((state.version = *ptr) != 1) - cie = NULL; /* unsupported version */ - else if (*++ptr) { - /* check if augmentation size is first (and thus present) */ - if (*ptr 
== 'z') { - while (++ptr < end && *ptr) { - switch(*ptr) { - /* check for ignorable (or already handled) - * nul-terminated augmentation string */ - case 'L': - case 'P': - case 'R': - continue; - case 'S': - frame->call_frame = 0; - continue; - default: - break; - } - break; - } - } - if (ptr >= end || *ptr) - cie = NULL; - } - if(!cie) - dprintk(1, "CIE unusable (%p,%p).", ptr, end); - ++ptr; - } - if (cie != NULL) { - /* get code aligment factor */ - state.codeAlign = get_uleb128(&ptr, end); - /* get data aligment factor */ - state.dataAlign = get_sleb128(&ptr, end); - if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) - cie = NULL; - else if (UNW_PC(frame) % state.codeAlign - || UNW_SP(frame) % sleb128abs(state.dataAlign)) { - dprintk(1, "Input pointer(s) misaligned (%lx,%lx).", - UNW_PC(frame), UNW_SP(frame)); - return -EPERM; - } else { - retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); - /* skip augmentation */ - if (((const char *)(cie + 2))[1] == 'z') { - uleb128_t augSize = get_uleb128(&ptr, end); - - ptr += augSize; - } - if (ptr > end - || retAddrReg >= ARRAY_SIZE(reg_info) - || REG_INVALID(retAddrReg) - || reg_info[retAddrReg].width != sizeof(unsigned long)) - cie = NULL; - } - if(!cie) - dprintk(1, "CIE validation failed (%p,%p).", ptr, end); - } - if (cie != NULL) { - state.cieStart = ptr; - ptr = state.cieEnd; - state.cieEnd = end; - end = (const u8 *)(fde + 1) + *fde; - /* skip augmentation */ - if (((const char *)(cie + 2))[1] == 'z') { - uleb128_t augSize = get_uleb128(&ptr, end); - - if ((ptr += augSize) > end) - fde = NULL; - } - if(!fde) - dprintk(1, "FDE validation failed (%p,%p).", ptr, end); - } - if (cie == NULL || fde == NULL) { -#ifdef CONFIG_FRAME_POINTER - unsigned long top, bottom; - - if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) - return -EPERM; - top = STACK_TOP(frame->task); - bottom = STACK_BOTTOM(frame->task); -# if FRAME_RETADDR_OFFSET < 0 - if (UNW_SP(frame) < top - && UNW_FP(frame) <= UNW_SP(frame) - && bottom < UNW_FP(frame) -# else - if (UNW_SP(frame) > top - && UNW_FP(frame) >= UNW_SP(frame) - && bottom > UNW_FP(frame) -# endif - && !((UNW_SP(frame) | UNW_FP(frame)) - & (sizeof(unsigned long) - 1))) { - unsigned long link; - - if (!probe_kernel_address( - (unsigned long *)(UNW_FP(frame) - + FRAME_LINK_OFFSET), - link) -# if FRAME_RETADDR_OFFSET < 0 - && link > bottom && link < UNW_FP(frame) -# else - && link > UNW_FP(frame) && link < bottom -# endif - && !(link & (sizeof(link) - 1)) - && !probe_kernel_address( - (unsigned long *)(UNW_FP(frame) - + FRAME_RETADDR_OFFSET), UNW_PC(frame))) { - UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET -# if FRAME_RETADDR_OFFSET < 0 - - -# else - + -# endif - sizeof(UNW_PC(frame)); - UNW_FP(frame) = link; - return 0; - } - } -#endif - return -ENXIO; - } - state.org = startLoc; - memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); - /* process instructions */ - if (!processCFI(ptr, end, pc, ptrType, &state) - || state.loc > endLoc - || state.regs[retAddrReg].where == Nowhere - || state.cfa.reg >= ARRAY_SIZE(reg_info) - || reg_info[state.cfa.reg].width != sizeof(unsigned long) - || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) - || state.cfa.offs % sizeof(unsigned long)) { - dprintk(1, "Unusable unwind info (%p,%p).", ptr, end); - return -EIO; - } - /* update frame */ -#ifndef CONFIG_AS_CFI_SIGNAL_FRAME - if(frame->call_frame - && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign)) - frame->call_frame = 0; -#endif - cfa = 
FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; - startLoc = min((unsigned long)UNW_SP(frame), cfa); - endLoc = max((unsigned long)UNW_SP(frame), cfa); - if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { - startLoc = min(STACK_LIMIT(cfa), cfa); - endLoc = max(STACK_LIMIT(cfa), cfa); - } -#ifndef CONFIG_64BIT -# define CASES CASE(8); CASE(16); CASE(32) -#else -# define CASES CASE(8); CASE(16); CASE(32); CASE(64) -#endif - pc = UNW_PC(frame); - sp = UNW_SP(frame); - for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { - if (REG_INVALID(i)) { - if (state.regs[i].where == Nowhere) - continue; - dprintk(1, "Cannot restore register %u (%d).", - i, state.regs[i].where); - return -EIO; - } - switch(state.regs[i].where) { - default: - break; - case Register: - if (state.regs[i].value >= ARRAY_SIZE(reg_info) - || REG_INVALID(state.regs[i].value) - || reg_info[i].width > reg_info[state.regs[i].value].width) { - dprintk(1, "Cannot restore register %u from register %lu.", - i, state.regs[i].value); - return -EIO; - } - switch(reg_info[state.regs[i].value].width) { -#define CASE(n) \ - case sizeof(u##n): \ - state.regs[i].value = FRAME_REG(state.regs[i].value, \ - const u##n); \ - break - CASES; -#undef CASE - default: - dprintk(1, "Unsupported register size %u (%lu).", - reg_info[state.regs[i].value].width, - state.regs[i].value); - return -EIO; - } - break; - } - } - for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { - if (REG_INVALID(i)) - continue; - switch(state.regs[i].where) { - case Nowhere: - if (reg_info[i].width != sizeof(UNW_SP(frame)) - || &FRAME_REG(i, __typeof__(UNW_SP(frame))) - != &UNW_SP(frame)) - continue; - UNW_SP(frame) = cfa; - break; - case Register: - switch(reg_info[i].width) { -#define CASE(n) case sizeof(u##n): \ - FRAME_REG(i, u##n) = state.regs[i].value; \ - break - CASES; -#undef CASE - default: - dprintk(1, "Unsupported register size %u (%u).", - reg_info[i].width, i); - return -EIO; - } - break; - case Value: - if (reg_info[i].width != sizeof(unsigned long)) { - dprintk(1, "Unsupported value size %u (%u).", - reg_info[i].width, i); - return -EIO; - } - FRAME_REG(i, unsigned long) = cfa + state.regs[i].value - * state.dataAlign; - break; - case Memory: { - unsigned long addr = cfa + state.regs[i].value - * state.dataAlign; - - if ((state.regs[i].value * state.dataAlign) - % sizeof(unsigned long) - || addr < startLoc - || addr + sizeof(unsigned long) < addr - || addr + sizeof(unsigned long) > endLoc) { - dprintk(1, "Bad memory location %lx (%lx).", - addr, state.regs[i].value); - return -EIO; - } - switch(reg_info[i].width) { -#define CASE(n) case sizeof(u##n): \ - probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \ - break - CASES; -#undef CASE - default: - dprintk(1, "Unsupported memory size %u (%u).", - reg_info[i].width, i); - return -EIO; - } - } - break; - } - } - - if (UNW_PC(frame) % state.codeAlign - || UNW_SP(frame) % sleb128abs(state.dataAlign)) { - dprintk(1, "Output pointer(s) misaligned (%lx,%lx).", - UNW_PC(frame), UNW_SP(frame)); - return -EIO; - } - if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) { - dprintk(1, "No progress (%lx,%lx).", pc, sp); - return -EIO; - } - - return 0; -#undef CASES -#undef FRAME_REG -} -EXPORT_SYMBOL(unwind); - -int unwind_init_frame_info(struct unwind_frame_info *info, - struct task_struct *tsk, - /*const*/ struct pt_regs *regs) -{ - info->task = tsk; - info->call_frame = 0; - arch_unw_init_frame_info(info, regs); - - return 0; -} -EXPORT_SYMBOL(unwind_init_frame_info); - -/* - * Prepare to unwind a blocked task. 
- */
-int unwind_init_blocked(struct unwind_frame_info *info,
-			struct task_struct *tsk)
-{
-	info->task = tsk;
-	info->call_frame = 0;
-	arch_unw_init_blocked(info);
-
-	return 0;
-}
-EXPORT_SYMBOL(unwind_init_blocked);
-
-/*
- * Prepare to unwind the currently running thread.
- */
-int unwind_init_running(struct unwind_frame_info *info,
-			asmlinkage int (*callback)(struct unwind_frame_info *,
-						   void *arg),
-			void *arg)
-{
-	info->task = current;
-	info->call_frame = 0;
-
-	return arch_unwind_init_running(info, callback, arg);
-}
-EXPORT_SYMBOL(unwind_init_running);
-
-/*
- * Unwind until the return pointer is in user-land (or until an error
- * occurs). Returns 0 if successful, negative number in case of
- * error.
- */
-int unwind_to_user(struct unwind_frame_info *info)
-{
-	while (!arch_unw_user_mode(info)) {
-		int err = unwind(info);
-
-		if (err < 0)
-			return err;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(unwind_to_user);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6b18675..a3da07c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -85,27 +85,24 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
 	return list_empty(&wq->list);
 }
 
+/*
+ * Set the workqueue on which a work item is to be run
+ * - Must *only* be called if the pending flag is set
+ */
 static inline void set_wq_data(struct work_struct *work, void *wq)
 {
-	unsigned long new, old, res;
+	unsigned long new;
+
+	BUG_ON(!work_pending(work));
 
-	/* assume the pending flag is already set and that the task has already
-	 * been queued on this workqueue */
 	new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
-	res = work->management;
-	if (res != new) {
-		do {
-			old = res;
-			new = (unsigned long) wq;
-			new |= (old & WORK_STRUCT_FLAG_MASK);
-			res = cmpxchg(&work->management, old, new);
-		} while (res != old);
-	}
+	new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
+	atomic_long_set(&work->data, new);
 }
 
 static inline void *get_wq_data(struct work_struct *work)
 {
-	return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
+	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
 static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
@@ -136,7 +133,7 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
 		list_del_init(&work->entry);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
-		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
 			work_release(work);
 		f(work);
 
@@ -209,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
 	int ret = 0, cpu = get_cpu();
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		if (unlikely(is_single_threaded(wq)))
 			cpu = singlethread_cpu;
 		BUG_ON(!list_empty(&work->entry));
@@ -236,7 +233,7 @@ static void delayed_work_timer_fn(unsigned long __data)
 /**
  * queue_delayed_work - queue work on a workqueue after delay
  * @wq: workqueue to use
- * @work: delayable work to queue
+ * @dwork: delayable work to queue
  * @delay: number of jiffies to wait before queueing
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
@@ -251,7 +248,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 	if (delay == 0)
 		return queue_work(wq, work);
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
@@ -271,7 +268,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * queue_delayed_work_on - queue work on specific CPU after delay
  * @cpu: CPU number to execute work on
  * @wq: workqueue to use
- * @work: work to queue
+ * @dwork: work to queue
  * @delay: number of jiffies to wait before queueing
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
@@ -283,7 +280,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
@@ -324,7 +321,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
 		BUG_ON(get_wq_data(work) != cwq);
-		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
 			work_release(work);
 		f(work);
 
@@ -640,9 +637,11 @@ int schedule_on_each_cpu(work_func_t func)
 
 	mutex_lock(&workqueue_mutex);
 	for_each_online_cpu(cpu) {
-		INIT_WORK(per_cpu_ptr(works, cpu), func);
-		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
-				per_cpu_ptr(works, cpu));
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, func);
+		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
 	}
 	mutex_unlock(&workqueue_mutex);
 	flush_workqueue(keventd_wq);
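
A note on the workqueue.c hunks above: the old set_wq_data() looped on cmpxchg() to preserve any flag bits that might change under it, while the new code relies on the pending bit as an ownership token. set_wq_data() is now only legal while WORK_STRUCT_PENDING is set (hence the added BUG_ON(!work_pending(work))), and the owner of the pending bit has exclusive use of the work item's data word, so a plain atomic_long_set() that re-ORs the existing flag bits is enough. The sketch below is a minimal user-space illustration of that ownership pattern, not kernel code: the names fake_work, fake_wq and claim_pending(), and the single-flag layout, are invented for the example, and it assumes the queue pointer is at least 2-byte aligned so bit 0 is free to hold the pending flag.

/*
 * Minimal user-space sketch of the ownership pattern the workqueue.c hunks
 * switch to.  Bit 0 of an atomic word is a "pending" flag; the remaining
 * bits carry a pointer.  Whoever wins the test-and-set of the pending bit
 * owns the whole word and may update it with a plain atomic store, so no
 * cmpxchg() loop is needed.  fake_work, fake_wq and claim_pending() are
 * invented names for illustration only.
 */
#include <stdatomic.h>
#include <stdio.h>

#define PENDING_MASK	(1UL << 0)
#define FLAG_MASK	PENDING_MASK		/* all flag bits live below the pointer */

struct fake_wq {
	const char *name;
};

struct fake_work {
	atomic_ulong data;			/* flag in bit 0, queue pointer above it */
};

/* Returns non-zero if we set the pending bit, i.e. we now own the item. */
static int claim_pending(struct fake_work *work)
{
	return !(atomic_fetch_or(&work->data, PENDING_MASK) & PENDING_MASK);
}

/* Legal only while we own the pending bit: direct store, keeping the flags. */
static void set_wq_data(struct fake_work *work, struct fake_wq *wq)
{
	unsigned long new = (unsigned long)wq | PENDING_MASK;

	new |= FLAG_MASK & atomic_load(&work->data);
	atomic_store(&work->data, new);
}

static struct fake_wq *get_wq_data(struct fake_work *work)
{
	return (struct fake_wq *)(atomic_load(&work->data) & ~FLAG_MASK);
}

int main(void)
{
	static struct fake_wq wq = { .name = "events" };
	struct fake_work work = { .data = 0 };

	if (claim_pending(&work)) {		/* first queueing attempt wins */
		set_wq_data(&work, &wq);
		printf("queued on %s\n", get_wq_data(&work)->name);
	}
	if (!claim_pending(&work))		/* a second attempt is rejected */
		printf("already pending\n");
	return 0;
}

Build with any C11 compiler (for example cc -std=c11 sketch.c). The point is only that the pending bit acts as a lock on the rest of the word, which is what lets the kernel replace the cmpxchg() loop with a direct assignment; the same reasoning explains why schedule_on_each_cpu() now sets WORK_STRUCT_PENDING itself before calling __queue_work() directly.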