diff options
Diffstat (limited to 'kernel')
35 files changed, 1711 insertions, 1101 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0b46a5d..c64ce9c 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY "explicit preemption points" to the kernel code. These new preemption points have been selected to reduce the maximum latency of rescheduling, providing faster application reactions, - at the cost of slighly lower throughput. + at the cost of slightly lower throughput. This allows reaction to interactive events by allowing a low priority process to voluntarily preempt itself even if it @@ -43,7 +43,7 @@ config PREEMPT even if it is in kernel mode executing a system call and would otherwise not be about to reach a natural preemption point. This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slighly lower throughput + system is under load, at the cost of slightly lower throughput and a slight runtime overhead to kernel code. Select this if you are building a kernel for a desktop or diff --git a/kernel/configs.c b/kernel/configs.c index 8fa1fb2..e84d3f9 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -61,18 +61,9 @@ static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { - loff_t pos = *offset; - ssize_t count; - - if (pos >= kernel_config_data_size) - return 0; - - count = min(len, (size_t)(kernel_config_data_size - pos)); - if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) - return -EFAULT; - - *offset += count; - return count; + return simple_read_from_buffer(buf, len, offset, + kernel_config_data + MAGIC_SIZE, + kernel_config_data_size); } static const struct file_operations ikconfig_file_ops = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 36e7084..208cf34 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu) (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %lx) \n", + (state = %ld, flags = %x) \n", p->comm, p->pid, cpu, p->state, p->flags); } write_unlock_irq(&tasklist_lock); @@ -120,11 +120,13 @@ static int take_cpu_down(void *unused) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { - int err; + int err, nr_calls = 0; struct task_struct *p; cpumask_t old_allowed, tmp; + void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (num_online_cpus() == 1) return -EBUSY; @@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, - (void *)(long)cpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, + hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, + hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); - return -EINVAL; + err = -EINVAL; + goto out_release; } /* Ensure that we are not runnable on dying cpu */ @@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, - (void *)(long)cpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, + hcpu) == NOTIFY_BAD) BUG(); if (IS_ERR(p)) { @@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu) /* This actually kills the CPU. */ __cpu_die(cpu); - /* Move it here so it can run. */ - kthread_bind(p, get_cpu()); - put_cpu(); - /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, - (void *)(long)cpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, + hcpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); @@ -185,6 +187,8 @@ out_thread: err = kthread_stop(p); out_allowed: set_cpus_allowed(current, old_allowed); +out_release: + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return err; } @@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_down(cpu); + err = _cpu_down(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; @@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu) #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu) +static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) { - int ret; + int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, + -1, &nr_calls); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) - raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED, hcpu); + __raw_notifier_call_chain(&cpu_chain, + CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return ret; } @@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_up(cpu); + err = _cpu_up(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; } #ifdef CONFIG_SUSPEND_SMP -/* Needed to prevent the microcode driver from requesting firmware in its CPU - * hotplug notifier during the suspend/resume. - */ -int suspend_cpu_hotplug; -EXPORT_SYMBOL(suspend_cpu_hotplug); - static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -267,7 +269,6 @@ int disable_nonboot_cpus(void) int cpu, first_cpu, error = 0; mutex_lock(&cpu_add_remove_lock); - suspend_cpu_hotplug = 1; first_cpu = first_cpu(cpu_online_map); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -277,7 +278,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; - error = _cpu_down(cpu); + error = _cpu_down(cpu, 1); if (!error) { cpu_set(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); @@ -294,7 +295,6 @@ int disable_nonboot_cpus(void) } else { printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } - suspend_cpu_hotplug = 0; mutex_unlock(&cpu_add_remove_lock); return error; } @@ -309,10 +309,9 @@ void enable_nonboot_cpus(void) if (cpus_empty(frozen_cpus)) goto out; - suspend_cpu_hotplug = 1; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { - error = _cpu_up(cpu); + error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); continue; @@ -320,7 +319,6 @@ void enable_nonboot_cpus(void) printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } cpus_clear(frozen_cpus); - suspend_cpu_hotplug = 0; out: mutex_unlock(&cpu_add_remove_lock); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 88b416d..f57854b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, { struct ctr_struct *ctr = file->private_data; - if (*ppos + nbytes > ctr->bufsz) - nbytes = ctr->bufsz - *ppos; - if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) - return -EFAULT; - *ppos += nbytes; - return nbytes; + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); } static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) diff --git a/kernel/exit.c b/kernel/exit.c index f5a7abb..b0c6f0c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -26,6 +26,7 @@ #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> +#include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> @@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. + * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit. * * The various task state such as scheduling policy and priority may have * been inherited from a user process, so we reset them to sane values here. * - * NOTE that reparent_to_init() gives the caller full capabilities. + * NOTE that reparent_to_kthreadd() gives the caller full capabilities. */ -static void reparent_to_init(void) +static void reparent_to_kthreadd(void) { write_lock_irq(&tasklist_lock); ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper(current); - current->real_parent = child_reaper(current); + current->real_parent = current->parent = kthreadd_task; add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -347,7 +347,7 @@ int disallow_signal(int sig) return -EINVAL; spin_lock_irq(¤t->sighand->siglock); - sigaddset(¤t->blocked, sig); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); return 0; @@ -400,7 +400,7 @@ void daemonize(const char *name, ...) current->files = init_task.files; atomic_inc(¤t->files->count); - reparent_to_init(); + reparent_to_kthreadd(); } EXPORT_SYMBOL(daemonize); diff --git a/kernel/fork.c b/kernel/fork.c index a8dd75d..5dd3979 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { - free_thread_info(tsk->thread_info); + free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } @@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } *tsk = *orig; - tsk->thread_info = ti; + tsk->stack = ti; setup_thread_stack(tsk, orig); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/kernel/futex.c b/kernel/futex.c index 600bc9d..b7ce15c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -16,6 +16,9 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -53,6 +56,12 @@ #include "rtmutex_common.h" +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -81,12 +90,12 @@ struct futex_pi_state { * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then * wake up q->waiters, then make the second condition true. */ struct futex_q { - struct list_head list; + struct plist_node list; wait_queue_head_t waiters; /* Which hash list lock to use: */ @@ -102,14 +111,20 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; + + /* + * This waiter is used in case of requeue from a + * normal futex to a PI-futex + */ + struct rt_mutex_waiter waiter; }; /* * Split the global futex_lock into every hash list lock. */ struct futex_hash_bucket { - spinlock_t lock; - struct list_head chain; + spinlock_t lock; + struct plist_head chain; }; static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; @@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Get parameters which are the keys for a futex. +/** + * get_futex_key - Get parameters which are the keys for a futex. + * @uaddr: virtual address of the futex + * @shared: NULL for a PROCESS_PRIVATE futex, + * ¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @key: address where result is stored. + * + * Returns a negative error code or 0 + * The key words are stored in *key on success. * * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * - * Returns: 0, or negative error code. - * The key words are stored in *key on success. - * - * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. + * fshared is NULL for PROCESS_PRIVATE futexes + * For other futexes, it points to ¤t->mm->mmap_sem and + * caller must have taken the reader lock. but NOT any spinlocks. */ -int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, + union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((key->both.offset % sizeof(u32)) != 0)) + if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; /* + * PROCESS_PRIVATE futexes are fast. + * As the mm cannot disappear under us and the 'key' only needs + * virtual address, we dont even have to find the underlying vma. + * Note : We do have to check 'uaddr' is a valid user address, + * but access_ok() should be faster than find_vma() + */ + if (!fshared) { + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + return -EFAULT; + key->private.mm = mm; + key->private.address = address; + return 0; + } + /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. */ @@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; + /* Save the user address in the ley */ + key->uaddr = uaddr; + /* * Private mappings are handled in a simple way. * @@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * mappings of _writable_ handles. */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { + key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ key->private.mm = mm; key->private.address = address; return 0; @@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key) * Linear file mappings are also simple. */ key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ + key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); @@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key); * Take a reference to the resource addressed by a key. * Can be called while holding spinlocks. * - * NOTE: mmap_sem MUST be held between get_futex_key() and calling this - * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ inline void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: atomic_inc(&key->shared.inode->i_count); - else + break; + case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); + break; } } EXPORT_SYMBOL_GPL(get_futex_key_refs); @@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs); */ void drop_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: iput(key->shared.inode); - else + break; + case FUT_OFF_MMSHARED: mmdrop(key->private.mm); + break; } } EXPORT_SYMBOL_GPL(drop_futex_key_refs); @@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) } /* - * Fault handling. Called with current->mm->mmap_sem held. + * Fault handling. + * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, int attempt) +static int futex_handle_fault(unsigned long address, + struct rw_semaphore *fshared, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; + int ret = -EFAULT; - if (attempt > 2 || !(vma = find_vma(mm, address)) || - vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) - return -EFAULT; + if (attempt > 2) + return ret; - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - return -EFAULT; + if (!fshared) + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + if (vma && address >= vma->vm_start && + (vma->vm_flags & VM_WRITE)) { + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + ret = 0; + current->min_flt++; + break; + case VM_FAULT_MAJOR: + ret = 0; + current->maj_flt++; + break; + } } - return 0; + if (!fshared) + up_read(&mm->mmap_sem); + return ret; } /* @@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr) } static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct list_head *head; + struct plist_head *head; struct task_struct *p; pid_t pid; head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { - if (match_futex(&this->key, &me->key)) { + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) WARN_ON(!atomic_read(&pi_state->refcount)); atomic_inc(&pi_state->refcount); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); /* Store the key for possible exit cleanups: */ - pi_state->key = me->key; + pi_state->key = *key; spin_lock_irq(&p->pi_lock); WARN_ON(!list_empty(&pi_state->list)); @@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) put_task_struct(p); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) */ static void wake_futex(struct futex_q *q) { - list_del_init(&q->list); + plist_del(&q->list, &q->list.plist); if (q->filp) send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the - * list_del_init() and also before assigning to q->lock_ptr. + * plist_del() and also before assigning to q->lock_ptr. */ wake_up_all(&q->waiters); /* @@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) */ if (!(uval & FUTEX_OWNER_DIED)) { newval = FUTEX_WAITERS | new_owner->pid; + /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ + newval |= (uval & FUTEX_WAITER_REQUEUED); pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, + int nr_wake) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret; - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_lock(&hb->lock); head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { if (this->pi_state) { ret = -EINVAL; @@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); + return ret; +} + +/* + * Called from futex_requeue_pi. + * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the + * PI-futex value; search its associated pi_state if an owner exist + * or create a new one without owner. + */ +static inline int +lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **pi_state) +{ + u32 curval, uval, newval; + +retry: + /* + * We can't handle a fault cleanly because we can't + * release the locks here. Simply return the fault. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */ + if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) + != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) { + /* + * No waiters yet, we prepare the futex to have some waiters. + */ + + uval = curval; + newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + } + + if (!(curval & FUTEX_TID_MASK) + || lookup_pi_state(curval, hb, key, pi_state)) { + /* the futex has no owner (yet) or the lookup failed: + allocate one pi_state without owner */ + + *pi_state = alloc_pi_state(); + + /* Already stores the key: */ + (*pi_state)->key = *key; + + /* init the mutex without owner */ + __rt_mutex_init(&(*pi_state)->pi_mutex, NULL); + } + + return 0; +} + +/* + * Keep the first nr_wake waiter from futex1, wake up one, + * and requeue the next nr_requeue waiters following hashed on + * one physical page to another physical page (PI-futex uaddr2) + */ +static int futex_requeue_pi(u32 __user *uaddr1, + struct rw_semaphore *fshared, + u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) +{ + union futex_key key1, key2; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head1; + struct futex_q *this, *next; + struct futex_pi_state *pi_state2 = NULL; + struct rt_mutex_waiter *waiter, *top_waiter = NULL; + struct rt_mutex *lock2 = NULL; + int ret, drop_count = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + +retry: + /* + * First take all the futex related locks: + */ + if (fshared) + down_read(fshared); + + ret = get_futex_key(uaddr1, fshared, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = get_futex_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* + * If we would have faulted, release mmap_sem, fault + * it in and start all over again. + */ + if (fshared) + up_read(fshared); + + ret = get_user(curval, uaddr1); + + if (!ret) + goto retry; + + return ret; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + head1 = &hb1->chain; + plist_for_each_entry_safe(this, next, head1, list) { + if (!match_futex (&this->key, &key1)) + continue; + if (++ret <= nr_wake) { + wake_futex(this); + } else { + /* + * FIRST: get and set the pi_state + */ + if (!pi_state2) { + int s; + /* do this only the first time we requeue someone */ + s = lookup_pi_state_for_requeue(uaddr2, hb2, + &key2, &pi_state2); + if (s) { + ret = s; + goto out_unlock; + } + + lock2 = &pi_state2->pi_mutex; + spin_lock(&lock2->wait_lock); + + /* Save the top waiter of the wait_list */ + if (rt_mutex_has_waiters(lock2)) + top_waiter = rt_mutex_top_waiter(lock2); + } else + atomic_inc(&pi_state2->refcount); + + + this->pi_state = pi_state2; + + /* + * SECOND: requeue futex_q to the correct hashbucket + */ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(head1 != &hb2->chain)) { + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } + this->key = key2; + get_futex_key_refs(&key2); + drop_count++; + + + /* + * THIRD: queue it to lock2 + */ + spin_lock_irq(&this->task->pi_lock); + waiter = &this->waiter; + waiter->task = this->task; + waiter->lock = lock2; + plist_node_init(&waiter->list_entry, this->task->prio); + plist_node_init(&waiter->pi_list_entry, this->task->prio); + plist_add(&waiter->list_entry, &lock2->wait_list); + this->task->pi_blocked_on = waiter; + spin_unlock_irq(&this->task->pi_lock); + + if (ret - nr_wake >= nr_requeue) + break; + } + } + + /* If we've requeued some tasks and the top_waiter of the rt_mutex + has changed, we must adjust the priority of the owner, if any */ + if (drop_count) { + struct task_struct *owner = rt_mutex_owner(lock2); + if (owner && + (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) { + int chain_walk = 0; + + spin_lock_irq(&owner->pi_lock); + if (top_waiter) + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + else + /* + * There was no waiters before the requeue, + * the flag must be updated + */ + mark_rt_mutex_waiters(lock2); + + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + chain_walk = 1; + get_task_struct(owner); + } + + spin_unlock_irq(&owner->pi_lock); + spin_unlock(&lock2->wait_lock); + + if (chain_walk) + rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL, + current); + } else { + /* No owner or the top_waiter does not change */ + mark_rt_mutex_waiters(lock2); + spin_unlock(&lock2->wait_lock); + } + } + +out_unlock: + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* drop_futex_key_refs() must be called outside the spinlocks. */ + while (--drop_count >= 0) + drop_futex_key_refs(&key1); + +out: + if (fshared) + up_read(fshared); return ret; } @@ -670,22 +985,24 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head; + struct plist_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; retryfull: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -725,11 +1042,10 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr2, - attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr2, + fshared, attempt); + if (ret) goto out; - } goto retry; } @@ -737,7 +1053,8 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. */ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -748,7 +1065,7 @@ retry: head = &hb1->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { wake_futex(this); if (++ret >= nr_wake) @@ -760,7 +1077,7 @@ retry: head = &hb2->chain; op_ret = 0; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { wake_futex(this); if (++op_ret >= nr_wake2) @@ -774,7 +1091,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -782,22 +1100,24 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head1; + struct plist_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; retry: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(curval, uaddr1); @@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, } head1 = &hb1->chain; - list_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { @@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * requeue. */ if (likely(head1 != &hb2->chain)) { - list_move_tail(&this->list, &hb2->chain); + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; - } +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } this->key = key2; get_futex_key_refs(&key2); drop_count++; @@ -869,7 +1194,8 @@ out_unlock: drop_futex_key_refs(&key1); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &hb->chain); + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + */ + prio = min(current->normal_prio, MAX_RT_PRIO); + + plist_node_init(&q->list, prio); +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); } @@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q) spin_unlock(lock_ptr); goto retry; } - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(q->pi_state); @@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q) /* * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock is held on entry and dropped here. + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here. */ -static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +static void unqueue_me_pi(struct futex_q *q) { - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); q->pi_state = NULL; - spin_unlock(&hb->lock); + spin_unlock(q->lock_ptr); drop_futex_key_refs(&q->key); } +/* + * Fixup the pi_state owner with current. + * + * The cur->mm semaphore must be held, it is released at return of this + * function. + */ +static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, + struct futex_q *q, + struct futex_hash_bucket *hb, + struct task_struct *curr) +{ + u32 newtid = curr->pid | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, curval, newval; + int ret; + + /* Owner died? */ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + pi_state->owner = curr; + + spin_lock_irq(&curr->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &curr->pi_state_list); + spin_unlock_irq(&curr->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(q); + if (fshared) + up_read(fshared); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + newval |= (uval & FUTEX_WAITER_REQUEUED); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + return ret; +} + +/* + * In case we must use restart_block to restart a futex_wait, + * we encode in the 'arg3' shared capability + */ +#define ARG3_SHARED 1 + static long futex_wait_restart(struct restart_block *restart); -static int futex_wait_abstime(u32 __user *uaddr, u32 val, - int timed, unsigned long abs_time) +static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, + u32 val, ktime_t *abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; - unsigned long time_left = 0; u32 uval; int ret; + struct hrtimer_sleeper t, *to = NULL; + int rem = 0; q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. * - * We hold the mmap semaphore, so the mapping cannot have changed - * since we looked it up in get_futex_key. + * for shared futexes, we hold the mmap semaphore, so the mapping + * cannot have changed since we looked it up in get_futex_key. */ ret = get_futex_value_locked(&uval, uaddr); @@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); @@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, if (uval != val) goto out_unlock_release_sem; + /* + * This rt_mutex_waiter structure is prepared here and will + * be used only if this task is requeued from a normal futex to + * a PI-futex with futex_requeue_pi. + */ + debug_rt_mutex_init_waiter(&q.waiter); + q.waiter.task = NULL; + /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&q.waiters, &wait); /* - * !list_empty() is safe here without any lock. + * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - time_left = 0; - if (likely(!list_empty(&q.list))) { - unsigned long rel_time; - - if (timed) { - unsigned long now = jiffies; - if (time_after(now, abs_time)) - rel_time = 0; - else - rel_time = abs_time - now; - } else - rel_time = MAX_SCHEDULE_TIMEOUT; + if (likely(!plist_node_empty(&q.list))) { + if (!abs_time) + schedule(); + else { + to = &t; + hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(&t, current); + t.timer.expires = *abs_time; - time_left = schedule_timeout(rel_time); + hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + + /* + * the timer could have already expired, in which + * case current would be flagged for rescheduling. + * Don't bother calling schedule. + */ + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + + /* Flag if a timeout occured */ + rem = (t.task == NULL); + } } __set_current_state(TASK_RUNNING); @@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, * we are the only user of it. */ + if (q.pi_state) { + /* + * We were woken but have been requeued on a PI-futex. + * We have to complete the lock acquisition by taking + * the rtmutex. + */ + + struct rt_mutex *lock = &q.pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + if (unlikely(q.waiter.task)) { + remove_waiter(lock, &q.waiter); + } + spin_unlock(&lock->wait_lock); + + if (rem) + ret = -ETIMEDOUT; + else + ret = rt_mutex_timed_lock(lock, to, 1); + + if (fshared) + down_read(fshared); + spin_lock(q.lock_ptr); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + /* + * We MUST play with the futex we were requeued on, + * NOT the current futex. + * We can retrieve it from the key of the pi_state + */ + uaddr = q.pi_state->key.uaddr; + + /* mmap_sem and hash_bucket lock are unlocked at + return of this function */ + ret = fixup_pi_state_owner(uaddr, fshared, + &q, hb, curr); + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); + } + + debug_rt_mutex_free_waiter(&q.waiter); + + return ret; + } + + debug_rt_mutex_free_waiter(&q.waiter); + /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time_left == 0) + if (rem) return -ETIMEDOUT; /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ - if (time_left == MAX_SCHEDULE_TIMEOUT) + if (!abs_time) return -ERESTARTSYS; else { struct restart_block *restart; @@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, restart->fn = futex_wait_restart; restart->arg0 = (unsigned long)uaddr; restart->arg1 = (unsigned long)val; - restart->arg2 = (unsigned long)timed; - restart->arg3 = abs_time; + restart->arg2 = (unsigned long)abs_time; + restart->arg3 = 0; + if (fshared) + restart->arg3 |= ARG3_SHARED; return -ERESTART_RESTARTBLOCK; } @@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val, queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) -{ - int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); - return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); -} static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->arg0; u32 val = (u32)restart->arg1; - int timed = (int)restart->arg2; - unsigned long abs_time = restart->arg3; + ktime_t *abs_time = (ktime_t *)restart->arg2; + struct rw_semaphore *fshared = NULL; restart->fn = do_no_restart_syscall; - return (long)futex_wait_abstime(uaddr, val, timed, abs_time); + if (restart->arg3 & ARG3_SHARED) + fshared = ¤t->mm->mmap_sem; + return (long)futex_wait(uaddr, fshared, val, abs_time); } +static void set_pi_futex_owner(struct futex_hash_bucket *hb, + union futex_key *key, struct task_struct *p) +{ + struct plist_head *head; + struct futex_q *this, *next; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex *lock; + + /* Search a waiter that should already exists */ + + head = &hb->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, key)) { + pi_state = this->pi_state; + break; + } + } + + BUG_ON(!pi_state); + + /* set p as pi_state's owner */ + lock = &pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + spin_lock_irq(&p->pi_lock); + + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + + + /* set p as pi_mutex's owner */ + debug_rt_mutex_proxy_lock(lock, p); + WARN_ON(rt_mutex_owner(lock)); + rt_mutex_set_owner(lock, p, 0); + rt_mutex_deadlock_account_lock(lock, p); + + plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry, + &p->pi_waiters); + __rt_mutex_adjust_prio(p); + + spin_unlock_irq(&p->pi_lock); + spin_unlock(&lock->wait_lock); +} + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) +static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, + int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, attempt = 0; + int ret, lock_held, attempt = 0; if (refill_pi_state_cache()) return -ENOMEM; - if (sec != MAX_SCHEDULE_TIMEOUT) { + if (time) { to = &timeout; hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); + to->timer.expires = *time; } q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; hb = queue_lock(&q, -1, NULL); retry_locked: + lock_held = 0; + /* * To avoid races, we attempt to take the lock here again * (by doing a 0 -> TID atomic cmpxchg), while holding all @@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { if (!detect && 0) force_sig(SIGKILL, current); - ret = -EDEADLK; + /* + * Normally, this check is done in user space. + * In case of requeue, the owner may attempt to lock this futex, + * even if the ownership has already been given by the previous + * waker. + * In the usual case, this is a case of deadlock, but not in case + * of REQUEUE_PI. + */ + if (!(curval & FUTEX_WAITER_REQUEUED)) + ret = -EDEADLK; goto out_unlock_release_sem; } @@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, goto out_unlock_release_sem; uval = curval; - newval = uval | FUTEX_WAITERS; + /* + * In case of a requeue, check if there already is an owner + * If not, just take the futex. + */ + if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { + /* set current as futex owner */ + newval = curval | current->pid; + lock_held = 1; + } else + /* Set the WAITERS flag, so the owner will know it has someone + to wake at next unlock */ + newval = curval | FUTEX_WAITERS; pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely(curval != uval)) goto retry_locked; + if (lock_held) { + set_pi_futex_owner(hb, &q.key, curr); + goto out_unlock_release_sem; + } + /* * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, &q); + ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); if (unlikely(ret)) { /* @@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); WARN_ON(!q.pi_state); /* @@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, ret = ret ? 0 : -EWOULDBLOCK; } - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); spin_lock(q.lock_ptr); /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case. */ - if (!ret && q.pi_state->owner != curr) { - u32 newtid = current->pid | FUTEX_WAITERS; - - /* Owner died? */ - if (q.pi_state->owner != NULL) { - spin_lock_irq(&q.pi_state->owner->pi_lock); - WARN_ON(list_empty(&q.pi_state->list)); - list_del_init(&q.pi_state->list); - spin_unlock_irq(&q.pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; - - q.pi_state->owner = current; - - spin_lock_irq(¤t->pi_lock); - WARN_ON(!list_empty(&q.pi_state->list)); - list_add(&q.pi_state->list, ¤t->pi_state_list); - spin_unlock_irq(¤t->pi_lock); - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); - /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. - */ - ret = get_user(uval, uaddr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - } else { + if (!ret && q.pi_state->owner != curr) + /* mmap_sem is unlocked at return of this function */ + ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); + else { /* * Catch the rare case, where the lock was released * when we were on the way back before we locked @@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, ret = 0; } /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); } if (!detect && ret == -EDEADLK && 0) @@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; uaddr_faulted: @@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock_release_sem; - } goto retry_locked; } queue_unlock(&q, hb); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr) +static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; u32 uval; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret, attempt = 0; @@ -1399,9 +1932,10 @@ retry: /* * First take all the futex related locks: */ - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -1435,7 +1969,7 @@ retry_locked: */ head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -1460,7 +1994,8 @@ retry_locked: out_unlock: spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; @@ -1472,15 +2007,16 @@ pi_faulted: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock; - } goto retry_locked; } spin_unlock(&hb->lock); - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp, poll_wait(filp, &q->waiters, wait); /* - * list_empty() is safe here without any lock. + * plist_node_empty() is safe here without any lock. * q->lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (list_empty(&q->list)) + if (plist_node_empty(&q->list)) ret = POLLIN | POLLRDNORM; return ret; @@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + struct rw_semaphore *fshared; static unsigned long printk_interval; if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { @@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal) } q->pi_state = NULL; - down_read(¤t->mm->mmap_sem); - err = get_futex_key(uaddr, &q->key); + fshared = ¤t->mm->mmap_sem; + down_read(fshared); + err = get_futex_key(uaddr, fshared, &q->key); if (unlikely(err != 0)) { - up_read(¤t->mm->mmap_sem); + up_read(fshared); kfree(q); goto error; } @@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->private_data = q; queue_me(q, ret, filp); - up_read(¤t->mm->mmap_sem); + up_read(fshared); /* Now we map fd to filp, so userspace can access it */ fd_install(ret, filp); @@ -1702,6 +2240,8 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + /* Also keep the FUTEX_WAITER_REQUEUED flag if set */ + mval |= (uval & FUTEX_WAITER_REQUEUED); nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); if (nval == -EFAULT) @@ -1716,7 +2256,7 @@ retry: */ if (!pi) { if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + futex_wake(uaddr, &curr->mm->mmap_sem, 1); } } return 0; @@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr) return; if (pending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); while (entry != &head->list) { /* @@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { int ret; + int cmd = op & FUTEX_CMD_MASK; + struct rw_semaphore *fshared = NULL; + + if (!(op & FUTEX_PRIVATE_FLAG)) + fshared = ¤t->mm->mmap_sem; - switch (op) { + switch (cmd) { case FUTEX_WAIT: - ret = futex_wait(uaddr, val, timeout); + ret = futex_wait(uaddr, fshared, val, timeout); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, val); + ret = futex_wake(uaddr, fshared, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ ret = futex_fd(uaddr, val); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr); + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3); break; default: ret = -ENOSYS; @@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; u32 val2 = 0; + int cmd = op & FUTEX_CMD_MASK; - if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (copy_from_user(&t, utime, sizeof(t)) != 0) + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { + if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; - if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add(ktime_get(), t); + tp = &t; } /* - * requeue parameter in 'utime' if op == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. */ - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE + || cmd == FUTEX_CMP_REQUEUE_PI) val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, @@ -1895,7 +2445,7 @@ static int __init init(void) } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - INIT_LIST_HEAD(&futex_queues[i].chain); + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); } return 0; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 50f24ee..338a9b4 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct compat_timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; int val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (get_compat_timespec(&t, utime)) + if (get_compat_timespec(&ts, utime)) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; + + t = timespec_to_ktime(ts); if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + t = ktime_add(ktime_get(), t); + tp = &t; } - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE + || op == FUTEX_CMP_REQUEUE_PI) val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c9f4f04..23c03f4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: init_hrtimers_cpu(cpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 32e1ab1..e391cbb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,7 +22,6 @@ * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number * @desc: description of the interrupt - * @regs: pointer to a register structure * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 49cc4b9..4d32eb0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); - flush_signals(current); spin_lock_irq(¤t->sighand->siglock); old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); @@ -186,14 +185,9 @@ static int wait_for_helper(void *data) { struct subprocess_info *sub_info = data; pid_t pid; - struct k_sigaction sa; /* Install a handler: if SIGCLD isn't handled sys_wait4 won't * populate the status, but will return -ECHILD. */ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, NULL); allow_signal(SIGCHLD); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); diff --git a/kernel/kthread.c b/kernel/kthread.c index 87c50cc..df8a8e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,7 +1,7 @@ /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * - * Creation is done via keventd, so that we get a clean environment + * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ @@ -15,24 +15,22 @@ #include <linux/mutex.h> #include <asm/semaphore.h> -/* - * We dont want to execute off keventd since it might - * hold a semaphore our callers hold too: - */ -static struct workqueue_struct *helper_wq; +static DEFINE_SPINLOCK(kthread_create_lock); +static LIST_HEAD(kthread_create_list); +struct task_struct *kthreadd_task; struct kthread_create_info { - /* Information passed to kthread() from keventd. */ + /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; struct completion started; - /* Result passed back to kthread_create() from keventd. */ + /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion done; - struct work_struct work; + struct list_head list; }; struct kthread_stop_info @@ -60,42 +58,17 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); -static void kthread_exit_files(void) -{ - struct fs_struct *fs; - struct task_struct *tsk = current; - - exit_fs(tsk); /* current->fs->count--; */ - fs = init_task.fs; - tsk->fs = fs; - atomic_inc(&fs->count); - exit_files(tsk); - current->files = init_task.files; - atomic_inc(&tsk->files->count); -} - static int kthread(void *_create) { struct kthread_create_info *create = _create; int (*threadfn)(void *data); void *data; - sigset_t blocked; int ret = -EINTR; - kthread_exit_files(); - - /* Copy data: it's on keventd's stack */ + /* Copy data: it's on kthread's stack */ threadfn = create->threadfn; data = create->data; - /* Block and flush all signals (in case we're not from keventd). */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* By default we can run anywhere, unlike keventd. */ - set_cpus_allowed(current, CPU_MASK_ALL); - /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_INTERRUPTIBLE); complete(&create->started); @@ -112,11 +85,8 @@ static int kthread(void *_create) return 0; } -/* We are keventd: create a thread. */ -static void keventd_create_kthread(struct work_struct *work) +static void create_kthread(struct kthread_create_info *create) { - struct kthread_create_info *create = - container_of(work, struct kthread_create_info, work); int pid; /* We want our own signal handler (we take no signals by default). */ @@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), create.data = data; init_completion(&create.started); init_completion(&create.done); - INIT_WORK(&create.work, keventd_create_kthread); - - /* - * The workqueue needs to start up first: - */ - if (!helper_wq) - create.work.func(&create.work); - else { - queue_work(helper_wq, &create.work); - wait_for_completion(&create.done); - } + + spin_lock(&kthread_create_lock); + list_add_tail(&create.list, &kthread_create_list); + wake_up_process(kthreadd_task); + spin_unlock(&kthread_create_lock); + + wait_for_completion(&create.done); + if (!IS_ERR(create.result)) { va_list args; va_start(args, namefmt); @@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), namefmt, args); va_end(args); } - return create.result; } EXPORT_SYMBOL(kthread_create); @@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); -static __init int helper_init(void) + +static __init void kthreadd_setup(void) { - helper_wq = create_singlethread_workqueue("kthread"); - BUG_ON(!helper_wq); + struct task_struct *tsk = current; - return 0; + set_task_comm(tsk, "kthreadd"); + + ignore_signals(tsk); + + set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); } -core_initcall(helper_init); +int kthreadd(void *unused) +{ + /* Setup a clean context for our children to inherit. */ + kthreadd_setup(); + + current->flags |= PF_NOFREEZE; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&kthread_create_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + spin_lock(&kthread_create_lock); + while (!list_empty(&kthread_create_list)) { + struct kthread_create_info *create; + + create = list_entry(kthread_create_list.next, + struct kthread_create_info, list); + list_del_init(&create->list); + spin_unlock(&kthread_create_lock); + + create_kthread(create); + + spin_lock(&kthread_create_lock); + } + spin_unlock(&kthread_create_lock); + } + + return 0; +} diff --git a/kernel/module.c b/kernel/module.c index d36e454..9bd93de 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag) mod->taints |= flag; } -/* A thread that wants to hold a reference to a module only while it - * is running can call ths to safely exit. - * nfsd and lockd use this. +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit. nfsd and lockd use this. */ void __module_put_and_exit(struct module *mod, long code) { @@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module mutex). */ +/* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { /* Delete from various lists */ @@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist - * in the Kernel or in some other modules exported symbol table. + * in the kernel or in some other module's exported symbol table. */ static int verify_export_symbols(struct module *mod) { diff --git a/kernel/mutex.c b/kernel/mutex.c index e7cbbb8..303eab1 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) debug_mutex_lock_common(lock, &waiter); mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - debug_mutex_add_waiter(lock, &waiter, task->thread_info); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) */ if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { - mutex_remove_waiter(lock, &waiter, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); mutex_release(&lock->dep_map, 1, _RET_IP_); spin_unlock_mutex(&lock->wait_lock, flags); @@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) } /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task->thread_info); - debug_mutex_set_owner(lock, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_set_owner(lock, task_thread_info(task)); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0633137..b5f0543 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_TEST, + HIBERNATION_TESTPROC, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +struct hibernation_ops *hibernation_ops; + +/** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +void hibernation_set_ops(struct hibernation_ops *ops) +{ + if (ops && !(ops->prepare && ops->enter && ops->finish)) { + WARN_ON(1); + return; + } + mutex_lock(&pm_mutex); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + mutex_unlock(&pm_mutex); +} + + /** * platform_prepare - prepare the machine for hibernation using the * platform driver if so configured and return an error code if it fails */ -static inline int platform_prepare(void) +static int platform_prepare(void) { - int error = 0; + return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + hibernation_ops->prepare() : 0; +} - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); - } - return error; +/** + * platform_finish - switch the machine to the normal mode of operation + * using the platform driver (must be called after platform_prepare()) + */ + +static void platform_finish(void) +{ + if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + hibernation_ops->finish(); } /** - * power_down - Shut machine down for hibernate. + * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured so; otherwise try * to power off or reboot. @@ -61,20 +100,20 @@ static inline int platform_prepare(void) static void power_down(void) { - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + switch (hibernation_mode) { + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - case PM_DISK_SHUTDOWN: + case HIBERNATION_SHUTDOWN: kernel_power_off(); break; - case PM_DISK_REBOOT: + case HIBERNATION_REBOOT: kernel_restart(NULL); break; - default: - if (pm_ops && pm_ops->enter) { + case HIBERNATION_PLATFORM: + if (hibernation_ops) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - pm_ops->enter(PM_SUSPEND_DISK); + hibernation_ops->enter(); break; } } @@ -87,20 +126,6 @@ static void power_down(void) while(1); } -static inline void platform_finish(void) -{ - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); - } -} - static void unprepare_processes(void) { thaw_processes(); @@ -120,13 +145,10 @@ static int prepare_processes(void) } /** - * pm_suspend_disk - The granpappy of hibernation power management. - * - * If not, then call swsusp to do its thing, then figure out how - * to power down the system. + * hibernate - The granpappy of the built-in hibernation management */ -int pm_suspend_disk(void) +int hibernate(void) { int error; @@ -143,7 +165,8 @@ int pm_suspend_disk(void) if (error) goto Finish; - if (pm_disk_mode == PM_DISK_TESTPROC) { + mutex_lock(&pm_mutex); + if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; @@ -168,7 +191,7 @@ int pm_suspend_disk(void) if (error) goto Enable_cpus; - if (pm_disk_mode == PM_DISK_TEST) { + if (hibernation_mode == HIBERNATION_TEST) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Enable_cpus; @@ -205,6 +228,7 @@ int pm_suspend_disk(void) device_resume(); resume_console(); Thaw: + mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); @@ -220,7 +244,7 @@ int pm_suspend_disk(void) * Called as a late_initcall (so all devices are discovered and * initialized), we call swsusp to see if we have a saved image or not. * If so, we quiesce devices, the restore the saved image. We will - * return above (in pm_suspend_disk() ) if everything goes well. + * return above (in hibernate() ) if everything goes well. * Otherwise, we fail gracefully and return to the normally * scheduled program. * @@ -315,25 +339,26 @@ static int software_resume(void) late_initcall(software_resume); -static const char * const pm_disk_modes[] = { - [PM_DISK_PLATFORM] = "platform", - [PM_DISK_SHUTDOWN] = "shutdown", - [PM_DISK_REBOOT] = "reboot", - [PM_DISK_TEST] = "test", - [PM_DISK_TESTPROC] = "testproc", +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", + [HIBERNATION_TEST] = "test", + [HIBERNATION_TESTPROC] = "testproc", }; /** - * disk - Control suspend-to-disk mode + * disk - Control hibernation mode * * Suspend-to-disk can be handled in several ways. We have a few options * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other pm_ops), powering off the system or rebooting the system - * (for testing) as well as the two test modes. + * or other hibernation_ops), powering off the system or rebooting the + * system (for testing) as well as the two test modes. * * The system can support 'platform', and that is known a priori (and - * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' - * as alternatives, as well as the test modes 'test' and 'testproc'. + * encoded by the presence of hibernation_ops). However, the user may + * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the + * test modes, 'test' or 'testproc'. * * show() will display what the mode is currently set to. * store() will accept one of @@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = { * 'testproc' * * It will only change to 'platform' if the system - * supports it (as determined from pm_ops->pm_disk_mode). + * supports it (as determined by having hibernation_ops). */ static ssize_t disk_show(struct kset *kset, char *buf) @@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf) int i; char *start = buf; - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!pm_disk_modes[i]) + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) continue; switch (i) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - default: - if (pm_ops && pm_ops->enter && - (i == pm_ops->pm_disk_mode)) + case HIBERNATION_PLATFORM: + if (hibernation_ops) break; /* not a valid mode, continue with loop */ continue; } - if (i == pm_disk_mode) - buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s", pm_disk_modes[i]); - if (i+1 != PM_DISK_MAX) - buf += sprintf(buf, " "); + buf += sprintf(buf, "%s ", hibernation_modes[i]); } buf += sprintf(buf, "\n"); return buf-start; @@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) int i; int len; char *p; - suspend_disk_method_t mode = 0; + int mode = HIBERNATION_INVALID; p = memchr(buf, '\n', n); len = p ? p - buf : n; mutex_lock(&pm_mutex); - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!strncmp(buf, pm_disk_modes[i], len)) { + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } } - if (mode) { + if (mode != HIBERNATION_INVALID) { switch (mode) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - pm_disk_mode = mode; + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + hibernation_mode = mode; break; - default: - if (pm_ops && pm_ops->enter && - (mode == pm_ops->pm_disk_mode)) - pm_disk_mode = mode; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; else error = -EINVAL; } - } else { + } else error = -EINVAL; - } - pr_debug("PM: suspend-to-disk mode set to '%s'\n", - pm_disk_modes[mode]); + if (!error) + pr_debug("PM: suspend-to-disk mode set to '%s'\n", + hibernation_modes[mode]); mutex_unlock(&pm_mutex); return error ? error : n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index f6dda68..40d56a3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -30,7 +30,6 @@ DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; /** * pm_set_ops - Set the global power method table. @@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops) { mutex_lock(&pm_mutex); pm_ops = ops; - if (ops && ops->pm_disk_mode != PM_DISK_INVALID) { - pm_disk_mode = ops->pm_disk_mode; - } else - pm_disk_mode = PM_DISK_SHUTDOWN; mutex_unlock(&pm_mutex); } @@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", - [PM_SUSPEND_DISK] = "disk", }; static inline int valid_state(suspend_state_t state) { - /* Suspend-to-disk does not really need low-level support. - * It can work with shutdown/reboot if needed. If it isn't - * configured, then it cannot be supported. - */ - if (state == PM_SUSPEND_DISK) -#ifdef CONFIG_SOFTWARE_SUSPEND - return 1; -#else - return 0; -#endif - - /* all other states need lowlevel support and need to be - * valid to the lowlevel implementation, no valid callback + /* All states need lowlevel support and need to be valid + * to the lowlevel implementation, no valid callback * implies that none are valid. */ if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) return 0; @@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; - if (state == PM_SUSPEND_DISK) { - error = pm_suspend_disk(); - goto Unlock; - } - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; @@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state) /** * pm_suspend - Externally visible function for suspending system. - * @state: Enumarted value of state to enter. + * @state: Enumerated value of state to enter. * * Determine whether or not value is within range, get state * structure, and enter (above). @@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf) if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } - s += sprintf(s,"\n"); +#ifdef CONFIG_SOFTWARE_SUSPEND + s += sprintf(s, "%s\n", "disk"); +#else + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; +#endif return (s - buf); } @@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) p = memchr(buf, '\n', n); len = p ? p - buf : n; + /* First, check if we are requested to hibernate */ + if (!strncmp(buf, "disk", len)) { + error = hibernate(); + return error ? error : n; + } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { if (*s && !strncmp(buf, *s, len)) break; diff --git a/kernel/power/power.h b/kernel/power/power.h index 34b4354..5138148 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,12 +25,7 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -extern int pm_suspend_disk(void); -#else -static inline int pm_suspend_disk(void) -{ - return -EPERM; -} +extern struct hibernation_ops *hibernation_ops; #endif extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 59fb89b..a3b7854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1233,7 +1233,7 @@ asmlinkage int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + printk("swsusp: critical section: done (%d pages copied)\n", nr_pages); return 0; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 040560d..24d7d78 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -130,16 +130,16 @@ static inline int platform_prepare(void) { int error = 0; - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); + if (hibernation_ops) + error = hibernation_ops->prepare(); return error; } static inline void platform_finish(void) { - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); + if (hibernation_ops) + hibernation_ops->finish(); } static inline int snapshot_suspend(int platform_suspend) @@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (pm_ops && pm_ops->enter) { + if (hibernation_ops) { data->platform_suspend = 1; error = 0; } else { @@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case PMOPS_ENTER: if (data->platform_suspend) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); - error = 0; + error = hibernation_ops->enter(); } break; diff --git a/kernel/profile.c b/kernel/profile.c index 9bfadb2..cc91b9b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { @@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, __free_page(page); return NOTIFY_BAD; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: cpu_set(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: cpu_clear(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3554b76..2c2dd84 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: rcu_offline_cpu(cpu); break; default: diff --git a/kernel/relay.c b/kernel/relay.c index 577f251..4311101 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @work: work struct that contains the the channel buffer + * @data: contains the channel buffer * - * This is the work function used to defer reader waking. The - * reason waking is deferred is that calling directly from write - * causes problems if you're writing from say the scheduler. + * This is the timer function used to defer reader waking. */ -static void wakeup_readers(struct work_struct *work) +static void wakeup_readers(unsigned long data) { - struct rchan_buf *buf = - container_of(work, struct rchan_buf, wake_readers.work); + struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); } @@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - INIT_DELAYED_WORK(&buf->wake_readers, NULL); - } else { - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); - } + setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + } else + del_timer_sync(&buf->timer); buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -447,8 +442,7 @@ end: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); + del_timer_sync(&buf->timer); kref_put(&buf->kref, relay_remove_buf); } @@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { if (chan->buf[hotcpu]) @@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, mutex_unlock(&relay_channels_mutex); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* No need to flush the cpu : will be flushed upon * final relay_flush() call. */ break; @@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->dentry->d_inode->i_size += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) { - PREPARE_DELAYED_WORK(&buf->wake_readers, - wakeup_readers); - schedule_delayed_work(&buf->wake_readers, 1); - } + if (waitqueue_active(&buf->read_wait)) + /* + * Calling wake_up_interruptible() from here + * will deadlock if we happen to be logging + * from the scheduler (trying to re-grab + * rq->lock), so defer it. + */ + __mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 180978c..12879f6 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -56,7 +56,7 @@ * state. */ -static void +void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, unsigned long mask) { @@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) } /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/* * Calculate task priority from the waiter list priority * * Return task->normal_prio when the waiter list is empty or when @@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task) * * This can be both boosting and unboosting. task->pi_lock must be held. */ -static void __rt_mutex_adjust_prio(struct task_struct *task) +void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); @@ -159,11 +136,11 @@ int max_lock_depth = 1024; * Decreases task's usage by one - may thus free the task. * Returns 0 or -EDEADLK. */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * * Must be called with lock->wait_lock held */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856..242ec7e 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) } /* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask); +extern void __rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task); +extern void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 66bd7ff..799d23b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -305,6 +305,7 @@ struct rq { }; static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_MUTEX(sched_hotcpu_mutex); static inline int cpu_of(struct rq *rq) { @@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return -ESRCH; } @@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return retval; } @@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); if (retval) return retval; @@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { + case CPU_LOCK_ACQUIRE: + mutex_lock(&sched_hotcpu_mutex); + break; + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; @@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; /* Unbind it from offline cpu so it can run. Fall thru. */ @@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if - * they didn't do lock_cpu_hotplug(). Just wake up + * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. */ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { @@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif + case CPU_LOCK_RELEASE: + mutex_unlock(&sched_hotcpu_mutex); + break; } return NOTIFY_OK; } @@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void) { int err; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return err; } @@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb, { switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: + case CPU_ONLINE_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Fall through and re-initialise the domains. */ @@ -6930,12 +6948,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 1368e67..2ac3a66 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -38,125 +38,6 @@ static struct kmem_cache *sigqueue_cachep; -/* - * In POSIX a signal is sent either to a specific thread (Linux task) - * or to the process as a whole (Linux thread group). How the signal - * is sent determines whether it's to one thread or the whole group, - * which determines which signal mask(s) are involved in blocking it - * from being delivered until later. When the signal is delivered, - * either it's caught or ignored by a user handler or it has a default - * effect that applies to the whole thread group (POSIX process). - * - * The possible effects an unblocked signal set to SIG_DFL can have are: - * ignore - Nothing Happens - * terminate - kill the process, i.e. all threads in the group, - * similar to exit_group. The group leader (only) reports - * WIFSIGNALED status to its parent. - * coredump - write a core dump file describing all threads using - * the same mm and then kill all those threads - * stop - stop all the threads in the group, i.e. TASK_STOPPED state - * - * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. - * Other signals when not blocked and set to SIG_DFL behaves as follows. - * The job control signals also have other special effects. - * - * +--------------------+------------------+ - * | POSIX signal | default action | - * +--------------------+------------------+ - * | SIGHUP | terminate | - * | SIGINT | terminate | - * | SIGQUIT | coredump | - * | SIGILL | coredump | - * | SIGTRAP | coredump | - * | SIGABRT/SIGIOT | coredump | - * | SIGBUS | coredump | - * | SIGFPE | coredump | - * | SIGKILL | terminate(+) | - * | SIGUSR1 | terminate | - * | SIGSEGV | coredump | - * | SIGUSR2 | terminate | - * | SIGPIPE | terminate | - * | SIGALRM | terminate | - * | SIGTERM | terminate | - * | SIGCHLD | ignore | - * | SIGCONT | ignore(*) | - * | SIGSTOP | stop(*)(+) | - * | SIGTSTP | stop(*) | - * | SIGTTIN | stop(*) | - * | SIGTTOU | stop(*) | - * | SIGURG | ignore | - * | SIGXCPU | coredump | - * | SIGXFSZ | coredump | - * | SIGVTALRM | terminate | - * | SIGPROF | terminate | - * | SIGPOLL/SIGIO | terminate | - * | SIGSYS/SIGUNUSED | coredump | - * | SIGSTKFLT | terminate | - * | SIGWINCH | ignore | - * | SIGPWR | terminate | - * | SIGRTMIN-SIGRTMAX | terminate | - * +--------------------+------------------+ - * | non-POSIX signal | default action | - * +--------------------+------------------+ - * | SIGEMT | coredump | - * +--------------------+------------------+ - * - * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". - * (*) Special job control effects: - * When SIGCONT is sent, it resumes the process (all threads in the group) - * from TASK_STOPPED state and also clears any pending/queued stop signals - * (any of those marked with "stop(*)"). This happens regardless of blocking, - * catching, or ignoring SIGCONT. When any stop signal is sent, it clears - * any pending/queued SIGCONT signals; this happens regardless of blocking, - * catching, or ignored the stop signal, though (except for SIGSTOP) the - * default action of stopping the process may happen later or never. - */ - -#ifdef SIGEMT -#define M_SIGEMT M(SIGEMT) -#else -#define M_SIGEMT 0 -#endif - -#if SIGRTMIN > BITS_PER_LONG -#define M(sig) (1ULL << ((sig)-1)) -#else -#define M(sig) (1UL << ((sig)-1)) -#endif -#define T(sig, mask) (M(sig) & (mask)) - -#define SIG_KERNEL_ONLY_MASK (\ - M(SIGKILL) | M(SIGSTOP) ) - -#define SIG_KERNEL_STOP_MASK (\ - M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) - -#define SIG_KERNEL_COREDUMP_MASK (\ - M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ - M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ - M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) - -#define SIG_KERNEL_IGNORE_MASK (\ - M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) - -#define sig_kernel_only(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) -#define sig_kernel_coredump(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) -#define sig_kernel_ignore(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) -#define sig_kernel_stop(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) - -#define sig_needs_tasklist(sig) ((sig) == SIGCONT) - -#define sig_user_defined(t, signr) \ - (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ - ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - -#define sig_fatal(t, signr) \ - (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ - (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) static int sig_ignored(struct task_struct *t, int sig) { @@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +void ignore_signals(struct task_struct *t) +{ + int i; + + for (i = 0; i < _NSIG; ++i) + t->sighand->action[i].sa.sa_handler = SIG_IGN; + + flush_signals(t); +} + /* * Flush all handlers for a task. */ @@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p) if (t->exit_state) continue; - /* - * We don't want to notify the parent, since we are - * killed as part of a thread group due to another - * thread doing an execve() or similar. So set the - * exit signal to -1 to allow immediate reaping of - * the process. But don't detach the thread group - * leader. - */ - if (t != p->group_leader) - t->exit_signal = -1; - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b75008..0b9886a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); @@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, per_cpu(ksoftirqd, hotcpu) = p; break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(ksoftirqd, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(ksoftirqd, hotcpu)) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 8fa7040..0131e29 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: BUG_ON(per_cpu(watchdog_task, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { @@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) kthread_bind(p, hotcpu); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); per_cpu(watchdog_task, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/sys.c b/kernel/sys.c index 926bf9d..cdb7e94 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl, return -ENOENT; } +/** + * notifier_call_chain - Informs the registered notifiers about an event. + * @nl: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: Number of notifier functions to be called. Don't care + * value of this parameter is -1. + * @nr_calls: Records the number of notifications sent. Don't care + * value of this field is NULL. + * @returns: notifier_call_chain returns the value returned by the + * last notifier function called. + */ + static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v) + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); - while (nb) { + + while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); + + if (nr_calls) + (*nr_calls)++; + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; nb = next_nb; + nr_to_call--; } return ret; } @@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** - * atomic_notifier_call_chain - Call functions in an atomic notifier chain + * __atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See the comment for notifier_call_chain. + * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); + +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) +{ + return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. @@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); /** - * blocking_notifier_call_chain - Call functions in a blocking notifier chain + * __blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * of the last notifier function called. */ -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; @@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_dereference(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, + nr_calls); up_read(&nh->rwsem); } return ret; } +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) +{ + return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); /** - * raw_notifier_call_chain - Call functions in a raw notifier chain + * __raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * of the last notifier function called. */ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} + +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); + int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return notifier_call_chain(&nh->head, val, v); + return __raw_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * of the last notifier function called. */ -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); srcu_read_unlock(&nh->srcu, idx); return ret; } +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** @@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = pm_suspend(PM_SUSPEND_DISK); + int ret = hibernate(); unlock_kernel(); return ret; } @@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid) } /* - * Samma på svenska.. + * Samma pÃ¥ svenska.. */ asmlinkage long sys_setfsgid(gid_t gid) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f0664bd..4073353 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -77,6 +77,7 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; +extern int sysctl_stat_interval; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -857,6 +858,17 @@ static ctl_table vm_table[] = { .extra2 = &one_hundred, }, #endif +#ifdef CONFIG_SMP + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fe5c7db..3db5c3c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,15 +74,17 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; +static int watchdog_resumed; + /* - * Interval: 0.5sec Treshold: 0.0625s + * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) static void clocksource_ratewd(struct clocksource *cs, int64_t delta) { - if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) return; printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", @@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs, *tmp; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int resumed; spin_lock(&watchdog_lock); + resumed = watchdog_resumed; + if (unlikely(resumed)) + watchdog_resumed = 0; + wdnow = watchdog->read(); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { csnow = cs->read(); + + if (unlikely(resumed)) { + cs->wd_last = csnow; + continue; + } + /* Initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && @@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data) } spin_unlock(&watchdog_lock); } +static void clocksource_resume_watchdog(void) +{ + spin_lock(&watchdog_lock); + watchdog_resumed = 1; + spin_unlock(&watchdog_lock); +} + static void clocksource_check_watchdog(struct clocksource *cs) { struct clocksource *cse; @@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } + +static inline void clocksource_resume_watchdog(void) { } #endif /** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct list_head *tmp; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs->resume) + cs->resume(); + } + + clocksource_resume_watchdog(); + + spin_unlock_irqrestore(&clocksource_lock, flags); +} + +/** * clocksource_get_next - Returns the selected clocksource * */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b734ca4..8bbcfb7 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), (unsigned long long)(ktime_to_ns(timer->expires) - now)); } @@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .index: %d\n", base->index); - SEQ_printf(m, " .resolution: %Ld nsecs\n", + SEQ_printf(m, " .resolution: %Lu nsecs\n", (unsigned long long)ktime_to_ns(base->resolution)); SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Ld nsecs\n", - ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now); @@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) print_base(m, cpu_base->clock_base + i, now); } #define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) #define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(cpu_base->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) #ifdef CONFIG_HIGH_RES_TIMERS P_ns(expires_next); @@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_TICK_ONESHOT # define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) # define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(ts->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); @@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(last_jiffies); P(next_jiffies); P_ns(idle_expires); - SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); } #endif diff --git a/kernel/timer.c b/kernel/timer.c index 7a64483..59a28b1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(tvec_base_t *base) { - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); + return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG); } static inline tvec_base_t *tbase_get_base(tvec_base_t *base) { - return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG); } static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = (tvec_base_t *)((unsigned long)timer->base | + TBASE_DEFERRABLE_FLAG); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { - timer->base = (tvec_base_t *)((unsigned long)(new_base) | + timer->base = (tvec_base_t *)((unsigned long)new_base | tbase_get_deferrable(timer->base)); } @@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: if (init_timers_cpu(cpu) < 0) return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_timers(cpu); break; #endif @@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti) prev = &curr->next; } + clocksource_resume(); + write_seqlock_irqsave(&xtime_lock, flags); if (ti == time_interpolator) { /* we lost the best time-interpolator: */ diff --git a/kernel/wait.c b/kernel/wait.c index 59a82f6..444ddbf 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue); * The spin_unlock() itself is semi-permeable and only protects * one way (it only protects stuff inside the critical region and * stops them from bleeding out - it would still allow subsequent - * loads to move into the the critical region). + * loads to move into the critical region). */ void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6fa5e6..fb56fed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,30 +36,20 @@ /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). - * - * The sequence counters are for flush_scheduled_work(). It wants to wait - * until all currently-scheduled works are completed, but it doesn't - * want to be livelocked by new, incoming ones. So it waits until - * remove_sequence is >= the insert_sequence which pertained when - * flush_scheduled_work() was called. */ struct cpu_workqueue_struct { spinlock_t lock; - long remove_sequence; /* Least-recently added (next to run) */ - long insert_sequence; /* Next to add */ - struct list_head worklist; wait_queue_head_t more_work; - wait_queue_head_t work_done; + struct work_struct *current_work; struct workqueue_struct *wq; struct task_struct *thread; + int should_stop; int run_depth; /* Detect run_workqueue() recursion depth */ - - int freezeable; /* Freeze the thread during suspend */ } ____cacheline_aligned; /* @@ -68,8 +58,10 @@ struct cpu_workqueue_struct { */ struct workqueue_struct { struct cpu_workqueue_struct *cpu_wq; + struct list_head list; const char *name; - struct list_head list; /* Empty if single thread */ + int singlethread; + int freezeable; /* Freeze threads during suspend */ }; /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove @@ -77,106 +69,68 @@ struct workqueue_struct { static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); -static int singlethread_cpu; +static int singlethread_cpu __read_mostly; +static cpumask_t cpu_singlethread_map __read_mostly; +/* optimization, we could use cpu_possible_map */ +static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { - return list_empty(&wq->list); + return wq->singlethread; +} + +static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +{ + return is_single_threaded(wq) + ? &cpu_singlethread_map : &cpu_populated_map; +} + +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ + if (unlikely(is_single_threaded(wq))) + cpu = singlethread_cpu; + return per_cpu_ptr(wq->cpu_wq, cpu); } /* * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set */ -static inline void set_wq_data(struct work_struct *work, void *wq) +static inline void set_wq_data(struct work_struct *work, + struct cpu_workqueue_struct *cwq) { unsigned long new; BUG_ON(!work_pending(work)); - new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); + new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); atomic_long_set(&work->data, new); } -static inline void *get_wq_data(struct work_struct *work) +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, int tail) { - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&cwq->lock, flags); + set_wq_data(work, cwq); /* - * We need to re-validate the work info after we've gotten - * the cpu_workqueue lock. We can run the work now iff: - * - * - the wq_data still matches the cpu_workqueue_struct - * - AND the work is still marked pending - * - AND the work is still on a list (which will be this - * workqueue_struct list) - * - * All these conditions are important, because we - * need to protect against the work being run right - * now on another CPU (all but the last one might be - * true if it's currently running and has not been - * released yet, for example). + * Ensure that we get the right work->data if we see the + * result of list_add() below, see try_to_grab_pending(). */ - if (get_wq_data(work) == cwq - && work_pending(work) - && !list_empty(&work->entry)) { - work_func_t f = work->func; - list_del_init(&work->entry); - spin_unlock_irqrestore(&cwq->lock, flags); - - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); - f(work); - - spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); - ret = 1; - } - spin_unlock_irqrestore(&cwq->lock, flags); - return ret; -} - -/** - * run_scheduled_work - run scheduled work synchronously - * @work: work to run - * - * This checks if the work was pending, and runs it - * synchronously if so. It returns a boolean to indicate - * whether it had any scheduled work to run or not. - * - * NOTE! This _only_ works for normal work_structs. You - * CANNOT use this for delayed work, because the wq data - * for delayed work will not point properly to the per- - * CPU workqueue struct, but will change! - */ -int fastcall run_scheduled_work(struct work_struct *work) -{ - for (;;) { - struct cpu_workqueue_struct *cwq; - - if (!work_pending(work)) - return 0; - if (list_empty(&work->entry)) - return 0; - /* NOTE! This depends intimately on __queue_work! */ - cwq = get_wq_data(work); - if (!cwq) - return 0; - if (__run_work(cwq, work)) - return 1; - } + smp_wmb(); + if (tail) + list_add_tail(&work->entry, &cwq->worklist); + else + list_add(&work->entry, &cwq->worklist); + wake_up(&cwq->more_work); } -EXPORT_SYMBOL(run_scheduled_work); /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, @@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - set_wq_data(work, cwq); - list_add_tail(&work->entry, &cwq->worklist); - cwq->insert_sequence++; - wake_up(&cwq->more_work); + insert_work(cwq, work, 1); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0, cpu = get_cpu(); + int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + __queue_work(wq_per_cpu(wq, get_cpu()), work); + put_cpu(); ret = 1; } - put_cpu(); return ret; } EXPORT_SYMBOL_GPL(queue_work); @@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct workqueue_struct *wq = get_wq_data(&dwork->work); - int cpu = smp_processor_id(); + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; - - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); + __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); } /** @@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data) int fastcall queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - timer_stats_timer_set_start_info(timer); + timer_stats_timer_set_start_info(&dwork->timer); if (delay == 0) - return queue_work(wq, work); - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + return queue_work(wq, &dwork->work); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - add_timer(timer); - ret = 1; - } - return ret; + return queue_delayed_work_on(-1, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); @@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; - add_timer_on(timer, cpu); + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); ret = 1; } return ret; @@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); static void run_workqueue(struct cpu_workqueue_struct *cwq) { - unsigned long flags; - - /* - * Keep taking off work from the queue until - * done. - */ - spin_lock_irqsave(&cwq->lock, flags); + spin_lock_irq(&cwq->lock); cwq->run_depth++; if (cwq->run_depth > 3) { /* morton gets to eat his hat */ @@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct work_struct, entry); work_func_t f = work->func; + cwq->current_work = work; list_del_init(cwq->worklist.next); - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); BUG_ON(get_wq_data(work) != cwq); - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); + work_clear_pending(work); f(work); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { @@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) dump_stack(); } - spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); + spin_lock_irq(&cwq->lock); + cwq->current_work = NULL; } cwq->run_depth--; - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); +} + +/* + * NOTE: the caller must not touch *cwq if this func returns true + */ +static int cwq_should_stop(struct cpu_workqueue_struct *cwq) +{ + int should_stop = cwq->should_stop; + + if (unlikely(should_stop)) { + spin_lock_irq(&cwq->lock); + should_stop = cwq->should_stop && list_empty(&cwq->worklist); + if (should_stop) + cwq->thread = NULL; + spin_unlock_irq(&cwq->lock); + } + + return should_stop; } static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; - DECLARE_WAITQUEUE(wait, current); - struct k_sigaction sa; - sigset_t blocked; + DEFINE_WAIT(wait); - if (!cwq->freezeable) + if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; set_user_nice(current, -5); - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* - * We inherited MPOL_INTERLEAVE from the booting kernel. - * Set MPOL_DEFAULT to insure node local allocations. - */ - numa_default_policy(); - - /* SIG_IGN makes children autoreap: see do_notify_parent(). */ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && !cwq->should_stop + && list_empty(&cwq->worklist)) + schedule(); + finish_wait(&cwq->more_work, &wait); - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - if (cwq->freezeable) - try_to_freeze(); + try_to_freeze(); - add_wait_queue(&cwq->more_work, &wait); - if (list_empty(&cwq->worklist)) - schedule(); - else - __set_current_state(TASK_RUNNING); - remove_wait_queue(&cwq->more_work, &wait); + if (cwq_should_stop(cwq)) + break; - if (!list_empty(&cwq->worklist)) - run_workqueue(cwq); - set_current_state(TASK_INTERRUPTIBLE); + run_workqueue(cwq); } - __set_current_state(TASK_RUNNING); + return 0; } +struct wq_barrier { + struct work_struct work; + struct completion done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ + struct wq_barrier *barr = container_of(work, struct wq_barrier, work); + complete(&barr->done); +} + +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, + struct wq_barrier *barr, int tail) +{ + INIT_WORK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + + init_completion(&barr->done); + + insert_work(cwq, &barr->work, tail); +} + static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) { if (cwq->thread == current) { @@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ run_workqueue(cwq); } else { - DEFINE_WAIT(wait); - long sequence_needed; + struct wq_barrier barr; + int active = 0; spin_lock_irq(&cwq->lock); - sequence_needed = cwq->insert_sequence; - - while (sequence_needed - cwq->remove_sequence > 0) { - prepare_to_wait(&cwq->work_done, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&cwq->lock); - schedule(); - spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, 1); + active = 1; } - finish_wait(&cwq->work_done, &wait); spin_unlock_irq(&cwq->lock); + + if (active) + wait_for_completion(&barr.done); } } @@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. * - * This function will sample each workqueue's current insert_sequence number and - * will sleep until the head sequence is greater than or equal to that. This - * means that we sleep until all works which were queued on entry have been - * handled, but we are not livelocked by new incoming ones. + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones. * * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. */ void fastcall flush_workqueue(struct workqueue_struct *wq) { + const cpumask_t *cpu_map = wq_cpu_map(wq); + int cpu; + might_sleep(); + for_each_cpu_mask(cpu, *cpu_map) + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); +} +EXPORT_SYMBOL_GPL(flush_workqueue); - if (is_single_threaded(wq)) { - /* Always use first cpu's area. */ - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); - } else { - int cpu; +/* + * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + int ret = 0; - mutex_lock(&workqueue_mutex); - for_each_online_cpu(cpu) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - mutex_unlock(&workqueue_mutex); + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + return 1; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + + cwq = get_wq_data(work); + if (!cwq) + return ret; + + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong cwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (cwq == get_wq_data(work)) { + list_del_init(&work->entry); + ret = 1; + } } + spin_unlock_irq(&cwq->lock); + + return ret; } -EXPORT_SYMBOL_GPL(flush_workqueue); -static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu, int freezeable) +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct task_struct *p; + struct wq_barrier barr; + int running = 0; - spin_lock_init(&cwq->lock); - cwq->wq = wq; - cwq->thread = NULL; - cwq->insert_sequence = 0; - cwq->remove_sequence = 0; - cwq->freezeable = freezeable; - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); - init_waitqueue_head(&cwq->work_done); + spin_lock_irq(&cwq->lock); + if (unlikely(cwq->current_work == work)) { + insert_wq_barrier(cwq, &barr, 0); + running = 1; + } + spin_unlock_irq(&cwq->lock); - if (is_single_threaded(wq)) - p = kthread_create(worker_thread, cwq, "%s", wq->name); - else - p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); - if (IS_ERR(p)) - return NULL; - cwq->thread = p; - return p; + if (unlikely(running)) + wait_for_completion(&barr.done); } -struct workqueue_struct *__create_workqueue(const char *name, - int singlethread, int freezeable) +static void wait_on_work(struct work_struct *work) { - int cpu, destroy = 0; + struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - struct task_struct *p; + const cpumask_t *cpu_map; + int cpu; - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - return NULL; + might_sleep(); - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + cwq = get_wq_data(work); + if (!cwq) + return; - wq->name = name; - mutex_lock(&workqueue_mutex); - if (singlethread) { - INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu, freezeable); - if (!p) - destroy = 1; - else - wake_up_process(p); - } else { - list_add(&wq->list, &workqueues); - for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu, freezeable); - if (p) { - kthread_bind(p, cpu); - wake_up_process(p); - } else - destroy = 1; - } - } - mutex_unlock(&workqueue_mutex); + wq = cwq->wq; + cpu_map = wq_cpu_map(wq); - /* - * Was there any error during startup? If yes then clean up: - */ - if (destroy) { - destroy_workqueue(wq); - wq = NULL; - } - return wq; + for_each_cpu_mask(cpu, *cpu_map) + wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } -EXPORT_SYMBOL_GPL(__create_workqueue); -static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +void cancel_work_sync(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - unsigned long flags; - struct task_struct *p; - - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - spin_lock_irqsave(&cwq->lock, flags); - p = cwq->thread; - cwq->thread = NULL; - spin_unlock_irqrestore(&cwq->lock, flags); - if (p) - kthread_stop(p); + while (!try_to_grab_pending(work)) + cpu_relax(); + wait_on_work(work); + work_clear_pending(work); } +EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue + * cancel_rearming_delayed_work - reliably kill off a delayed work. + * @dwork: the delayed work struct * - * Safely destroy a workqueue. All work currently pending will be done first. + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync(). */ -void destroy_workqueue(struct workqueue_struct *wq) +void cancel_rearming_delayed_work(struct delayed_work *dwork) { - int cpu; - - flush_workqueue(wq); - - /* We don't need the distraction of CPUs appearing and vanishing. */ - mutex_lock(&workqueue_mutex); - if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, singlethread_cpu); - else { - for_each_online_cpu(cpu) - cleanup_workqueue_thread(wq, cpu); - list_del(&wq->list); - } - mutex_unlock(&workqueue_mutex); - free_percpu(wq->cpu_wq); - kfree(wq); + while (!del_timer(&dwork->timer) && + !try_to_grab_pending(&dwork->work)) + cpu_relax(); + wait_on_work(&dwork->work); + work_clear_pending(&dwork->work); } -EXPORT_SYMBOL_GPL(destroy_workqueue); +EXPORT_SYMBOL(cancel_rearming_delayed_work); -static struct workqueue_struct *keventd_wq; +static struct workqueue_struct *keventd_wq __read_mostly; /** * schedule_work - put work task in global workqueue @@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - mutex_lock(&workqueue_mutex); + preempt_disable(); /* CPU hotplug */ for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - mutex_unlock(&workqueue_mutex); + preempt_enable(); flush_workqueue(keventd_wq); free_percpu(works); return 0; @@ -659,29 +596,6 @@ void flush_scheduled_work(void) EXPORT_SYMBOL(flush_scheduled_work); /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. - * @wq: the controlling workqueue structure - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, - struct delayed_work *dwork) -{ - while (!cancel_delayed_work(dwork)) - flush_workqueue(wq); -} -EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); - -/** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) -{ - cancel_rearming_delayed_workqueue(keventd_wq, dwork); -} -EXPORT_SYMBOL(cancel_rearming_delayed_work); - -/** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must @@ -728,94 +642,209 @@ int current_is_keventd(void) } -/* Take the work from this (downed) CPU. */ -static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct list_head list; - struct work_struct *work; - spin_lock_irq(&cwq->lock); - list_replace_init(&cwq->worklist, &list); + cwq->wq = wq; + spin_lock_init(&cwq->lock); + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + + return cwq; +} + +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct workqueue_struct *wq = cwq->wq; + const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + struct task_struct *p; + + p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); + /* + * Nobody can add the work_struct to this cwq, + * if (caller is __create_workqueue) + * nobody should see this wq + * else // caller is CPU_UP_PREPARE + * cpu is not on cpu_online_map + * so we can abort safely. + */ + if (IS_ERR(p)) + return PTR_ERR(p); + + cwq->thread = p; + cwq->should_stop = 0; + + return 0; +} + +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct task_struct *p = cwq->thread; - while (!list_empty(&list)) { - printk("Taking work for %s\n", wq->name); - work = list_entry(list.next,struct work_struct,entry); - list_del(&work->entry); - __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); + if (p != NULL) { + if (cpu >= 0) + kthread_bind(p, cpu); + wake_up_process(p); } - spin_unlock_irq(&cwq->lock); } -/* We're holding the cpucontrol mutex here */ -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +struct workqueue_struct *__create_workqueue(const char *name, + int singlethread, int freezeable) { - unsigned int hotcpu = (unsigned long)hcpu; struct workqueue_struct *wq; + struct cpu_workqueue_struct *cwq; + int err = 0, cpu; - switch (action) { - case CPU_UP_PREPARE: - mutex_lock(&workqueue_mutex); - /* Create a new workqueue thread for it. */ - list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu, 0)) { - printk("workqueue for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - } - break; + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; - case CPU_ONLINE: - /* Kick off worker threads. */ - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } - cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); - kthread_bind(cwq->thread, hotcpu); - wake_up_process(cwq->thread); - } - mutex_unlock(&workqueue_mutex); - break; + wq->name = name; + wq->singlethread = singlethread; + wq->freezeable = freezeable; + INIT_LIST_HEAD(&wq->list); - case CPU_UP_CANCELED: - list_for_each_entry(wq, &workqueues, list) { - if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + if (singlethread) { + cwq = init_cpu_workqueue(wq, singlethread_cpu); + err = create_workqueue_thread(cwq, singlethread_cpu); + start_workqueue_thread(cwq, -1); + } else { + mutex_lock(&workqueue_mutex); + list_add(&wq->list, &workqueues); + + for_each_possible_cpu(cpu) { + cwq = init_cpu_workqueue(wq, cpu); + if (err || !cpu_online(cpu)) continue; - /* Unbind so it can run. */ - kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, - any_online_cpu(cpu_online_map)); - cleanup_workqueue_thread(wq, hotcpu); + err = create_workqueue_thread(cwq, cpu); + start_workqueue_thread(cwq, cpu); } mutex_unlock(&workqueue_mutex); - break; + } + + if (err) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue); + +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct wq_barrier barr; + int alive = 0; + + spin_lock_irq(&cwq->lock); + if (cwq->thread != NULL) { + insert_wq_barrier(cwq, &barr, 1); + cwq->should_stop = 1; + alive = 1; + } + spin_unlock_irq(&cwq->lock); + + if (alive) { + wait_for_completion(&barr.done); - case CPU_DOWN_PREPARE: + while (unlikely(cwq->thread != NULL)) + cpu_relax(); + /* + * Wait until cwq->thread unlocks cwq->lock, + * it won't touch *cwq after that. + */ + smp_rmb(); + spin_unlock_wait(&cwq->lock); + } +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + const cpumask_t *cpu_map = wq_cpu_map(wq); + struct cpu_workqueue_struct *cwq; + int cpu; + + mutex_lock(&workqueue_mutex); + list_del(&wq->list); + mutex_unlock(&workqueue_mutex); + + for_each_cpu_mask(cpu, *cpu_map) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + cleanup_workqueue_thread(cwq, cpu); + } + + free_percpu(wq->cpu_wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_LOCK_ACQUIRE: mutex_lock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DOWN_FAILED: + case CPU_LOCK_RELEASE: mutex_unlock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DEAD: - list_for_each_entry(wq, &workqueues, list) - cleanup_workqueue_thread(wq, hotcpu); - list_for_each_entry(wq, &workqueues, list) - take_over_work(wq, hotcpu); - mutex_unlock(&workqueue_mutex); - break; + case CPU_UP_PREPARE: + cpu_set(cpu, cpu_populated_map); + } + + list_for_each_entry(wq, &workqueues, list) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + switch (action) { + case CPU_UP_PREPARE: + if (!create_workqueue_thread(cwq, cpu)) + break; + printk(KERN_ERR "workqueue for %i failed\n", cpu); + return NOTIFY_BAD; + + case CPU_ONLINE: + start_workqueue_thread(cwq, cpu); + break; + + case CPU_UP_CANCELED: + start_workqueue_thread(cwq, -1); + case CPU_DEAD: + cleanup_workqueue_thread(cwq, cpu); + break; + } } return NOTIFY_OK; } -void init_workqueues(void) +void __init init_workqueues(void) { + cpu_populated_map = cpu_online_map; singlethread_cpu = first_cpu(cpu_possible_map); + cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } - |