diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/futex.c | 201 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 4 | ||||
-rw-r--r-- | kernel/locking/mutex-debug.c | 7 | ||||
-rw-r--r-- | kernel/panic.c | 2 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 5 | ||||
-rw-r--r-- | kernel/rcu/srcu.c | 57 | ||||
-rw-r--r-- | kernel/rcu/torture.c | 75 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 97 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 12 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 102 | ||||
-rw-r--r-- | kernel/rcu/tree_trace.c | 3 | ||||
-rw-r--r-- | kernel/rcu/update.c | 5 | ||||
-rw-r--r-- | kernel/softirq.c | 49 |
14 files changed, 495 insertions, 126 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index 5721f0e..dfa736c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1172,7 +1172,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * do not allow it to share a thread group or signal handlers or * parent with the forking task. */ - if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if (clone_flags & CLONE_SIGHAND) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) diff --git a/kernel/futex.c b/kernel/futex.c index f6ff019..1ddc449 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,14 +63,101 @@ #include <linux/sched/rt.h> #include <linux/hugetlb.h> #include <linux/freezer.h> +#include <linux/bootmem.h> #include <asm/futex.h> #include "locking/rtmutex_common.h" -int __read_mostly futex_cmpxchg_enabled; +/* + * Basic futex operation and ordering guarantees: + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * + * waiters++; + * mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `-------> mb(); (B) + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * wake_waiters(futex); + * unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read -- this + * is guaranteed by the head counter in the hb spinlock; and where (B) + * orders the write to futex and the waiters read -- this is done by the + * barriers in get_futex_key_refs(), through either ihold or atomic_inc, + * depending on the futex type. + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + */ -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) +int __read_mostly futex_cmpxchg_enabled; /* * Futex flags used to encode options to functions and preserve them across @@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = { struct futex_hash_bucket { spinlock_t lock; struct plist_head chain; -}; +} ____cacheline_aligned_in_smp; -static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; +static unsigned long __read_mostly futex_hashsize; + +static struct futex_hash_bucket *futex_queues; + +static inline void futex_get_mm(union futex_key *key) +{ + atomic_inc(&key->private.mm->mm_count); + /* + * Ensure futex_get_mm() implies a full barrier such that + * get_futex_key() implies a full barrier. This is relied upon + * as full barrier (B), see the ordering comment above. + */ + smp_mb__after_atomic_inc(); +} + +static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + /* + * Tasks trying to enter the critical region are most likely + * potential waiters that will be added to the plist. Ensure + * that wakers won't miss to-be-slept tasks in the window between + * the wait call and the actual plist_add. + */ + if (spin_is_locked(&hb->lock)) + return true; + smp_rmb(); /* Make sure we check the lock state first */ + + return !plist_head_empty(&hb->chain); +#else + return true; +#endif +} /* * We hash on the keys returned from get_futex_key (see below). @@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) u32 hash = jhash2((u32*)&key->both.word, (sizeof(key->both.word)+sizeof(key->both.ptr))/4, key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; + return &futex_queues[hash & (futex_hashsize - 1)]; } /* @@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); + ihold(key->shared.inode); /* implies MB (B) */ break; case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); + futex_get_mm(key); /* implies MB (B) */ break; } } @@ -264,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ return 0; } @@ -371,7 +490,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ out: unlock_page(page_head); @@ -598,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct plist_head *head; struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up @@ -986,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; int ret; @@ -998,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) goto out; hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + goto out_put_key; + spin_lock(&hb->lock); - head = &hb->chain; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1019,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); +out_put_key: put_futex_key(&key); out: return ret; @@ -1034,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; struct futex_q *this, *next; int ret, op_ret; @@ -1082,9 +1201,7 @@ retry_private: goto retry; } - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (match_futex (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1097,10 +1214,8 @@ retry_private: } if (op_ret > 0) { - head = &hb2->chain; - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (match_futex (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1270,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, int drop_count = 0, task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; struct futex_q *this, *next; u32 curval2; @@ -1393,8 +1507,7 @@ retry_private: } } - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (task_count - nr_wake >= nr_requeue) break; @@ -1489,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); + spin_lock(&hb->lock); /* implies MB (A) */ return hb; } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); @@ -1867,7 +1980,7 @@ retry_private: ret = get_futex_value_locked(&uval, uaddr); if (ret) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = get_user(uval, uaddr); if (ret) @@ -1881,7 +1994,7 @@ retry_private: } if (uval != val) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = -EWOULDBLOCK; } @@ -2029,7 +2142,7 @@ retry_private: * Task is exiting and we just wait for the * exit to complete. */ - queue_unlock(&q, hb); + queue_unlock(hb); put_futex_key(&q.key); cond_resched(); goto retry; @@ -2081,7 +2194,7 @@ retry_private: goto out_put_key; out_unlock_put_key: - queue_unlock(&q, hb); + queue_unlock(hb); out_put_key: put_futex_key(&q.key); @@ -2091,7 +2204,7 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - queue_unlock(&q, hb); + queue_unlock(hb); ret = fault_in_user_writeable(uaddr); if (ret) @@ -2113,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; u32 uval, vpid = task_pid_vnr(current); int ret; @@ -2153,9 +2265,7 @@ retry: * Ok, other tasks may need to be woken up - check waiters * and do the wakeup if necessary: */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -2734,8 +2844,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, static int __init futex_init(void) { u32 curval; - int i; + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; /* * This will fail and we want it. Some arch implementations do * runtime detection of the futex_atomic_cmpxchg_inatomic() @@ -2749,7 +2872,7 @@ static int __init futex_init(void) if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 576ba75..eb8a547 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class) /* * Is this the address of a static object: */ +#ifdef __KERNEL__ static int static_obj(void *obj) { unsigned long start = (unsigned long) &_stext, @@ -616,6 +617,7 @@ static int static_obj(void *obj) */ return is_module_address(addr) || is_module_percpu_address(addr); } +#endif /* * To make lock name printouts unique, we calculate a unique @@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); +#ifdef __KERNEL__ void debug_show_all_locks(void) { struct task_struct *g, *p; @@ -4172,6 +4175,7 @@ retry: read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); +#endif /* * Careful: only use this function if you are sure that diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443f..faf6f5b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); + + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/panic.c b/kernel/panic.c index c00b4ce..6d63003 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,7 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7859a0a..79c3877 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -extern void kfree(const void *); +void kfree(const void *); static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; + rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + rcu_lock_release(&rcu_callback_map); return 0; } } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 01d5ccb..3318d82 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp) /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)) @@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) * Note that it is illegal to call synchronize_srcu() from the corresponding * SRCU read-side critical section; doing so will result in deadlock. * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. */ void synchronize_srcu(struct srcu_struct *sp) { @@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. * - * Note that it is also illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; - * doing so will result in deadlock. However, it is perfectly legal - * to call synchronize_srcu_expedited() on one srcu_struct from some - * other srcu_struct's read-side critical section, as long as - * the resulting graph of srcu_structs is acyclic. + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). */ void synchronize_srcu_expedited(struct srcu_struct *sp) { @@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. */ void srcu_barrier(struct srcu_struct *sp) { diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 3929cd4..732f8ae 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define VERBOSE_PRINTK_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) -static char printk_buf[4096]; - static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; @@ -376,7 +374,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - int (*stats)(char *page); + void (*stats)(char *page); int irq_capable; int can_boost; const char *name; @@ -578,21 +576,19 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static int srcu_torture_stats(char *page) +static void srcu_torture_stats(char *page) { - int cnt = 0; int cpu; int idx = srcu_ctl.completed & 0x1; - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, + page += sprintf(page, " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + sprintf(page, "\n"); } static void srcu_torture_synchronize_expedited(void) @@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg) /* * Create an RCU-torture statistics message in the specified buffer. */ -static int +static void rcu_torture_printk(char *page) { - int cnt = 0; int cpu; int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; @@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, @@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", + page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); - cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", + page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], + page += sprintf(page, "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", n_online_successes, n_online_attempts, n_offline_successes, n_offline_attempts, min_online, max_online, min_offline, max_offline, sum_online, sum_offline, HZ); - cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", + page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - cnt += sprintf(&page[cnt], "!!! "); + page += sprintf(page, "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - cnt += sprintf(&page[cnt], "Reader Pipe: "); + page += sprintf(page, "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); + page += sprintf(page, " %ld", pipesummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + page += sprintf(page, " %ld", batchsummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", + page += sprintf(page, " %d", atomic_read(&rcu_torture_wcount[i])); } - cnt += sprintf(&page[cnt], "\n"); + page += sprintf(page, "\n"); if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; + cur_ops->stats(page); } /* @@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page) static void rcu_torture_stats_print(void) { - int cnt; + int size = nr_cpu_ids * 200 + 8192; + char *buf; - cnt = rcu_torture_printk(printk_buf); - pr_alert("%s", printk_buf); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("rcu-torture: Out of memory, need: %d", size); + return; + } + rcu_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); } /* @@ -1578,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; bool lastphase = 0; + bool newphase; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1585,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - barrier_phase != lastphase || + (newphase = + ACCESS_ONCE(barrier_phase)) != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); - lastphase = barrier_phase; + lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; @@ -1625,7 +1628,7 @@ static int rcu_torture_barrier(void *arg) if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; n_barrier_attempts++; - cur_ops->cb_barrier(); + cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; WARN_ON_ONCE(1); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd08198..b3d116c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { + struct rcu_state *rsp; + struct rcu_data *rdp; + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + do_nocb_deferred_wakeup(rdp); + } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - else + rcu_eqs_enter_common(rdtp, oldval, user); + } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_eqs_enter_common(rdtp, oldval, user); + } } /** @@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) + if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else + } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(rdtp, oldval, user); + } } /** @@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void) bool ret; if (in_nmi()) - return 1; + return true; preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; @@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, } /* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + +/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, */ rcu_kick_nohz_cpu(rdp->cpu); + /* + * Alternatively, the CPU might be running in the kernel + * for an extended period of time without a quiescent state. + * Attempt to force the CPU through the scheduler to gain the + * needed quiescent state, but only if the grace period has gone + * on for an uncommonly long time. If there are many stuck CPUs, + * we will beat on the first one until it gets unstuck, then move + * to the next. Only do this for the primary flavor of RCU. + */ + if (rdp->rsp == rcu_state && + ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { + rdp->rsp->jiffies_resched += 5; + resched_cpu(rdp->cpu); + } + return 0; } static void record_gp_stall_check_time(struct rcu_state *rsp) { unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j1; rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ - rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); + j1 = rcu_jiffies_till_stall_check(); + rsp->jiffies_stall = j + j1; + rsp->jiffies_resched = j + j1 / 2; } /* @@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) + if (rnp != rnp_root) { raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); + } /* * Get a new grace-period number. If there really is no grace @@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } + smp_mb__after_unlock_lock(); __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); if (rsp->gp_flags == 0) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rsp->gp_flags &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } @@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) __note_gp_changes(rsp, rnp, rdp); + /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ @@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; cond_resched(); @@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg) (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) @@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); WARN_ON_ONCE(rnp_c->qsmask); } @@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum) { @@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Adopt the RCU callbacks from the specified rcu_state structure's * orphanage. The caller must hold the ->orphan_lock. */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp); + rcu_adopt_orphan_cbs(rsp, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) @@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp, cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (!rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rnp = rcu_get_root(rsp); if (rnp->qsmask == 0) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave(&rnp_old->lock, flags); + smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* If there are callbacks ready, invoke them. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_callbacks(rsp, rdp); + + /* Do any needed deferred wakeups of rcuo kthreads. */ + do_nocb_deferred_wakeup(rdp); } /* @@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp_root = rcu_get_root(rsp); raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); } else { @@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy); + offline = !__call_rcu_nocb(rdp, head, lazy, flags); WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ local_irq_restore(flags); @@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ + if (rcu_nohz_full_cpu(rsp)) + return 0; + /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { @@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) { + rdp->n_rp_nocb_defer_wakeup++; + return 1; + } + /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = rcu_num_lvls - 1; i > 0; i--) + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + rcu_fanout_leaf, nr_cpu_ids); /* * Compute number of nodes that can be handled an rcu_node tree diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 52be957..8c19873 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -317,6 +317,7 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; + unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; /* 6) _rcu_barrier() and OOM callbacks. */ @@ -335,6 +336,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -453,6 +455,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + unsigned long jiffies_resched; /* Time at which to resched */ + /* a reluctant CPU. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy); + bool lazy, unsigned long flags); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp); + struct rcu_data *rdp, + unsigned long flags); +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); @@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); +static bool rcu_nohz_full_cpu(struct rcu_state *rsp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 08a7652..6e2ef4b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu) rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { rcu_preempt_qs(smp_processor_id()); + if (!t->rcu_read_unlock_special) { + local_irq_restore(flags); + return; + } } - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + /* Hardware IRQ handlers cannot block, complain if they get here. */ + if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { local_irq_restore(flags); return; } @@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t) for (;;) { rnp = t->rcu_blocked_node; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, while (!list_empty(lp)) { t = list_entry(lp->next, typeof(*t), rcu_node_entry); raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); list_del(&t->rcu_node_entry); t->rcu_blocked_node = rnp_root; list_add(&t->rcu_node_entry, lp_root); @@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * in this case. */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); if (rnp_root->boost_tasks != NULL && rnp_root->boost_tasks != rnp_root->gp_tasks && rnp_root->boost_tasks != rnp_root->exp_tasks) @@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ wake_up(&sync_rcu_preempt_exp_wq); + } break; } mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); rnp->expmask &= ~mask; } } @@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void) /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->expmask = rnp->qsmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* * Recheck under the lock: all tasks in need of boosting @@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_BOOST_PRIO; @@ -1769,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu) continue; rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -1852,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self, /* Wait for callbacks from earlier instance to complete. */ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + smp_mb(); /* Ensure callback reuse happens after callback invocation. */ /* * Prevent premature wakeup: ensure that all increments happen @@ -2101,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu) static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct rcu_head *rhp, struct rcu_head **rhtp, - int rhcount, int rhcount_lazy) + int rhcount, int rhcount_lazy, + unsigned long flags) { int len; struct rcu_head **old_rhpp; @@ -2122,9 +2142,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { - wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + if (!irqs_disabled_flags(flags)) { + wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rdp->nocb_defer_wakeup = true; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmptyIsDeferred")); + } rdp->qlen_last_fqs_check = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = LONG_MAX / 2; @@ -2145,12 +2172,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, * "rcuo" kthread can find it. */ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, (unsigned long)rhp->func, @@ -2168,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * not a no-CBs CPU. */ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { long ql = rsp->qlen; long qll = rsp->qlen_lazy; @@ -2182,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* First, enqueue the donelist, if any. This preserves CB ordering. */ if (rsp->orphan_donelist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll); + rsp->orphan_donetail, ql, qll, flags); ql = qll = 0; rsp->orphan_donelist = NULL; rsp->orphan_donetail = &rsp->orphan_donelist; } if (rsp->orphan_nxtlist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll); + rsp->orphan_nxttail, ql, qll, flags); ql = qll = 0; rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; @@ -2209,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); c = rcu_start_future_gp(rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -2250,6 +2279,7 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("Sleep")); wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + /* Memory barrier provide by xchg() below. */ } else if (firsttime) { firsttime = 0; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2310,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg) return 0; } +/* Is a deferred wakeup of rcu_nocb_kthread() required? */ +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return ACCESS_ONCE(rdp->nocb_defer_wakeup); +} + +/* Do a deferred wakeup of rcu_nocb_kthread(). */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (!rcu_nocb_need_deferred_wakeup(rdp)) + return; + ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; + wake_up(&rdp->nocb_wq); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); +} + /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { @@ -2365,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { return 0; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { return 0; } @@ -2380,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ +} + static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } @@ -2829,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) } #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +/* + * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the + * grace-period kthread will do force_quiescent_state() processing? + * The idea is to avoid waking up RCU core processing on such a + * CPU unless the grace period has extended for too long. + * + * This code relies on the fact that all NO_HZ_FULL CPUs are also + * CONFIG_RCU_NOCB_CPUs. + */ +static bool rcu_nohz_full_cpu(struct rcu_state *rsp) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(smp_processor_id()) && + (!rcu_gp_in_progress(rsp) || + ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) + return 1; +#endif /* #ifdef CONFIG_NO_HZ_FULL */ + return 0; +} diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3596797..4def475 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, + rdp->n_rp_nocb_defer_wakeup, rdp->n_rp_need_nothing); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6cb3dff..802365c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +static struct lock_class_key rcu_callback_key; +struct lockdep_map rcu_callback_map = + STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); +EXPORT_SYMBOL_GPL(rcu_callback_map); + int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active && debug_locks && diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025cc..9a4500e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -211,14 +211,48 @@ EXPORT_SYMBOL(local_bh_enable_ip); #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) #define MAX_SOFTIRQ_RESTART 10 +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * When we run softirqs from irq_exit() and thus on the hardirq stack we need + * to keep the lockdep irq context tracking as tight as possible in order to + * not miss-qualify lock contexts and miss possible deadlocks. + */ + +static inline bool lockdep_softirq_start(void) +{ + bool in_hardirq = false; + + if (trace_hardirq_context(current)) { + in_hardirq = true; + trace_hardirq_exit(); + } + + lockdep_softirq_enter(); + + return in_hardirq; +} + +static inline void lockdep_softirq_end(bool in_hardirq) +{ + lockdep_softirq_exit(); + + if (in_hardirq) + trace_hardirq_enter(); +} +#else +static inline bool lockdep_softirq_start(void) { return false; } +static inline void lockdep_softirq_end(bool in_hardirq) { } +#endif + asmlinkage void __do_softirq(void) { - struct softirq_action *h; - __u32 pending; unsigned long end = jiffies + MAX_SOFTIRQ_TIME; - int cpu; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; + struct softirq_action *h; + bool in_hardirq; + __u32 pending; + int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -231,7 +265,7 @@ asmlinkage void __do_softirq(void) account_irq_enter_time(current); __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_enter(); + in_hardirq = lockdep_softirq_start(); cpu = smp_processor_id(); restart: @@ -278,16 +312,13 @@ restart: wakeup_softirqd(); } - lockdep_softirq_exit(); - + lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } - - asmlinkage void do_softirq(void) { __u32 pending; @@ -375,13 +406,13 @@ void irq_exit(void) #endif account_irq_exit_time(current); - trace_hardirq_exit(); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); tick_irq_exit(); rcu_irq_exit(); + trace_hardirq_exit(); /* must be last! */ } /* |