diff options
-rw-r--r-- | Documentation/kernel-parameters.txt | 88 | ||||
-rw-r--r-- | arch/um/drivers/mconsole_kern.c | 1 | ||||
-rw-r--r-- | include/linux/rculist.h | 40 | ||||
-rw-r--r-- | include/linux/rcupdate.h | 20 | ||||
-rw-r--r-- | include/linux/rcutiny.h | 11 | ||||
-rw-r--r-- | include/linux/rcutree.h | 19 | ||||
-rw-r--r-- | include/linux/sched.h | 10 | ||||
-rw-r--r-- | include/trace/events/rcu.h | 2 | ||||
-rw-r--r-- | init/Kconfig | 50 | ||||
-rw-r--r-- | kernel/rcupdate.c | 28 | ||||
-rw-r--r-- | kernel/rcutiny_plugin.h | 16 | ||||
-rw-r--r-- | kernel/rcutree.c | 332 | ||||
-rw-r--r-- | kernel/rcutree.h | 23 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 154 | ||||
-rw-r--r-- | kernel/rcutree_trace.c | 4 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/timer.c | 8 | ||||
-rw-r--r-- | lib/list_debug.c | 22 |
18 files changed, 617 insertions, 212 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c1601e5..ab84a01 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2330,18 +2330,100 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. - rcupdate.blimit= [KNL,BOOT] + rcutree.blimit= [KNL,BOOT] Set maximum number of finished RCU callbacks to process in one batch. - rcupdate.qhimark= [KNL,BOOT] + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. - rcupdate.qlowmark= [KNL,BOOT] + rcutree.qlowmark= [KNL,BOOT] Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.rcu_cpu_stall_suppress= [KNL,BOOT] + Suppress RCU CPU stall warning messages. + + rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] + Set timeout for RCU CPU stall warning messages. + + rcutorture.fqs_duration= [KNL,BOOT] + Set duration of force_quiescent_state bursts. + + rcutorture.fqs_holdoff= [KNL,BOOT] + Set holdoff time within force_quiescent_state bursts. + + rcutorture.fqs_stutter= [KNL,BOOT] + Set wait time between force_quiescent_state bursts. + + rcutorture.irqreader= [KNL,BOOT] + Test RCU readers from irq handlers. + + rcutorture.n_barrier_cbs= [KNL,BOOT] + Set callbacks/threads for rcu_barrier() testing. + + rcutorture.nfakewriters= [KNL,BOOT] + Set number of concurrent RCU writers. These just + stress RCU, they don't participate in the actual + test, hence the "fake". + + rcutorture.nreaders= [KNL,BOOT] + Set number of RCU readers. + + rcutorture.onoff_holdoff= [KNL,BOOT] + Set time (s) after boot for CPU-hotplug testing. + + rcutorture.onoff_interval= [KNL,BOOT] + Set time (s) between CPU-hotplug operations, or + zero to disable CPU-hotplug testing. + + rcutorture.shuffle_interval= [KNL,BOOT] + Set task-shuffle interval (s). Shuffling tasks + allows some CPUs to go into dyntick-idle mode + during the rcutorture test. + + rcutorture.shutdown_secs= [KNL,BOOT] + Set time (s) after boot system shutdown. This + is useful for hands-off automated testing. + + rcutorture.stall_cpu= [KNL,BOOT] + Duration of CPU stall (s) to test RCU CPU stall + warnings, zero to disable. + + rcutorture.stall_cpu_holdoff= [KNL,BOOT] + Time to wait (s) after boot before inducing stall. + + rcutorture.stat_interval= [KNL,BOOT] + Time (s) between statistics printk()s. + + rcutorture.stutter= [KNL,BOOT] + Time (s) to stutter testing, for example, specifying + five seconds causes the test to run for five seconds, + wait for five seconds, and so on. This tests RCU's + ability to transition abruptly to and from idle. + + rcutorture.test_boost= [KNL,BOOT] + Test RCU priority boosting? 0=no, 1=maybe, 2=yes. + "Maybe" means test if the RCU implementation + under test support RCU priority boosting. + + rcutorture.test_boost_duration= [KNL,BOOT] + Duration (s) of each individual boost test. + + rcutorture.test_boost_interval= [KNL,BOOT] + Interval (s) between each boost test. + + rcutorture.test_no_idle_hz= [KNL,BOOT] + Test RCU's dyntick-idle handling. See also the + rcutorture.shuffle_interval parameter. + + rcutorture.torture_type= [KNL,BOOT] + Specify the RCU implementation to test. + + rcutorture.verbose= [KNL,BOOT] + Enable additional printk() statements. + rdinit= [KNL] Format: <full_path> Run specified binary instead of /init from the ramdisk, diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 43b39d6..88e466b 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,6 +705,7 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; + rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rculist.h b/include/linux/rculist.h index d079290..e0f0fab 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -30,6 +30,7 @@ * This is only for internal list manipulation where we know * the prev/next entries already! */ +#ifndef CONFIG_DEBUG_LIST static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { @@ -38,6 +39,10 @@ static inline void __list_add_rcu(struct list_head *new, rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } +#else +extern void __list_add_rcu(struct list_head *new, + struct list_head *prev, struct list_head *next); +#endif /** * list_add_rcu - add a new entry to rcu-protected list @@ -108,7 +113,7 @@ static inline void list_add_tail_rcu(struct list_head *new, */ static inline void list_del_rcu(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del_entry(entry); entry->prev = LIST_POISON2; } @@ -228,18 +233,43 @@ static inline void list_splice_init_rcu(struct list_head *list, }) /** - * list_first_entry_rcu - get the first element from a list + * Where are list_empty_rcu() and list_first_entry_rcu()? + * + * Implementing those functions following their counterparts list_empty() and + * list_first_entry() is not advisable because they lead to subtle race + * conditions as the following snippet shows: + * + * if (!list_empty_rcu(mylist)) { + * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); + * do_something(bar); + * } + * + * The list may not be empty when list_empty_rcu checks it, but it may be when + * list_first_entry_rcu rereads the ->next pointer. + * + * Rereading the ->next pointer is not a problem for list_empty() and + * list_first_entry() because they would be protected by a lock that blocks + * writers. + * + * See list_first_or_null_rcu for an alternative. + */ + +/** + * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. * - * Note, that list is expected to be not empty. + * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ -#define list_first_entry_rcu(ptr, type, member) \ - list_entry_rcu((ptr)->next, type, member) +#define list_first_or_null_rcu(ptr, type, member) \ + ({struct list_head *__ptr = (ptr); \ + struct list_head __rcu *__next = list_next_rcu(__ptr); \ + likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \ + }) /** * list_for_each_entry_rcu - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 20fb776..26d1a47 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,12 +184,14 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); +extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); extern void rcu_idle_exit(void); extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void exit_rcu(void); /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers @@ -922,6 +924,21 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) kfree_call_rcu(head, (rcu_callback)offset); } +/* + * Does the specified offset indicate that the corresponding rcu_head + * structure can be handled by kfree_rcu()? + */ +#define __is_kfree_rcu_offset(offset) ((offset) < 4096) + +/* + * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. + */ +#define __kfree_rcu(head, offset) \ + do { \ + BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ + call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \ + } while (0) + /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree @@ -944,6 +961,9 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) * * Note that the allowable offset might decrease in the future, for example, * to allow something like kmem_cache_free_rcu(). + * + * The BUILD_BUG_ON check must not involve any function calls, hence the + * checks are done in macros here. */ #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e93df77..adb5e5a 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,14 +87,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline void rcu_preempt_note_context_switch(void) -{ -} - -static inline void exit_rcu(void) -{ -} - static inline int rcu_needs_cpu(int cpu) { return 0; @@ -102,8 +94,6 @@ static inline int rcu_needs_cpu(int cpu) #else /* #ifdef CONFIG_TINY_RCU */ -void rcu_preempt_note_context_switch(void); -extern void exit_rcu(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu) @@ -116,7 +106,6 @@ static inline int rcu_needs_cpu(int cpu) static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e8ee5dd..3c6083c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,18 +45,6 @@ static inline void rcu_virt_note_context_switch(int cpu) rcu_note_context_switch(cpu); } -#ifdef CONFIG_TREE_PREEMPT_RCU - -extern void exit_rcu(void); - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -static inline void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - extern void synchronize_rcu_bh(void); extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); @@ -98,13 +86,6 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); -/* A context switch is a grace period for RCU-sched and RCU-bh. */ -static inline int rcu_blocking_is_gp(void) -{ - might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() == 1; -} - extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; diff --git a/include/linux/sched.h b/include/linux/sched.h index 81a173c..8f3fd94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1905,12 +1905,22 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } +static inline void rcu_switch_from(struct task_struct *prev) +{ + if (prev->rcu_read_lock_nesting != 0) + rcu_preempt_note_context_switch(); +} + #else static inline void rcu_copy_process(struct task_struct *p) { } +static inline void rcu_switch_from(struct task_struct *prev) +{ +} + #endif #ifdef CONFIG_SMP diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 3370997..1480900 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -292,6 +292,8 @@ TRACE_EVENT(rcu_dyntick, * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. + * "Demigrate": Timer fired on wrong CPU, woke up correct CPU. + * "Cleanup after idle": Idle exited, timer canceled. */ TRACE_EVENT(rcu_prep_idle, diff --git a/init/Kconfig b/init/Kconfig index 6cfd71d..6d18ef8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -458,6 +458,33 @@ config RCU_FANOUT Select a specific number if testing RCU itself. Take the default if unsure. +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 RCU_FANOUT if 64BIT + range 2 RCU_FANOUT if !64BIT + depends on TREE_RCU || TREE_PREEMPT_RCU + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems. + + Take the default if unsure. + config RCU_FANOUT_EXACT bool "Disable tree-based hierarchical RCU auto-balancing" depends on TREE_RCU || TREE_PREEMPT_RCU @@ -515,10 +542,25 @@ config RCU_BOOST_PRIO depends on RCU_BOOST default 1 help - This option specifies the real-time priority to which preempted - RCU readers are to be boosted. If you are working with CPU-bound - real-time applications, you should specify a priority higher then - the highest-priority CPU-bound application. + This option specifies the real-time priority to which long-term + preempted RCU readers are to be boosted. If you are working + with a real-time application that has one or more CPU-bound + threads running at a real-time priority level, you should set + RCU_BOOST_PRIO to a priority higher then the highest-priority + real-time CPU-bound thread. The default RCU_BOOST_PRIO value + of 1 is appropriate in the common case, which is real-time + applications that do not have any CPU-bound threads. + + Some real-time applications might not have a single real-time + thread that saturates a given CPU, but instead might have + multiple real-time threads that, taken together, fully utilize + that CPU. In this case, you should set RCU_BOOST_PRIO to + a priority higher than the lowest-priority thread that is + conspiring to prevent the CPU from running any non-real-time + tasks. For example, if one thread at priority 10 and another + thread at priority 5 are between themselves fully consuming + the CPU time on a given CPU, then RCU_BOOST_PRIO should be + set to priority 6 or higher. Specify the real-time priority, or take the default if unsure. diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f174..95cba41 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -51,6 +51,34 @@ #include "rcu.h" +#ifdef CONFIG_PREEMPT_RCU + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0..fc31a2d 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } -/* - * Check for a task exiting while in a preemptible -RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d..b3ea3ac 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ + .orphan_nxttail = &structname##_state.orphan_nxtlist, \ + .orphan_donetail = &structname##_state.orphan_donelist, \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; +/* State information for rcu_barrier() and friends. */ + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* - * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Also record a quiescent state for this CPU for the current grace period. - * Synchronization and interrupt disabling are not required because - * this function executes in stop_machine() context. Therefore, cleanup - * operations that might block must be done later from the CPU_DEAD - * notifier. - * - * Note that the outgoing CPU's bit has already been cleared in the - * cpu_online_mask. This allows us to randomly pick a callback - * destination from the bits set in that mask. + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->onofflock. */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) { int i; - unsigned long mask; - int receive_cpu = cpumask_any(cpu_online_mask); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ - /* First, adjust the counts. */ + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because ->onofflock excludes _rcu_barrier()'s adoption of + * the callbacks, thus no memory barrier is required. + */ if (rdp->nxtlist != NULL) { - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; + rsp->qlen_lazy += rdp->qlen_lazy; + rsp->qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; rdp->qlen = 0; } /* - * Next, move ready-to-invoke callbacks to be invoked on some - * other CPU. These will not be required to pass through another - * grace period: They are done, regardless of CPU. + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + * We don't bother updating the ->nxttail[] array yet, instead + * we just reset the whole thing later on. */ - if (rdp->nxtlist != NULL && - rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { - struct rcu_head *oldhead; - struct rcu_head **oldtail; - struct rcu_head **newtail; - - oldhead = rdp->nxtlist; - oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; - *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; - newtail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { - if (receive_rdp->nxttail[i] == oldtail) - receive_rdp->nxttail[i] = newtail; - if (rdp->nxttail[i] == newtail) - rdp->nxttail[i] = &rdp->nxtlist; - } + if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { + *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; + rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; } /* - * Finally, put the rest of the callbacks at the end of the list. - * The ones that made it partway through get to start over: We - * cannot assume that grace periods are synchronized across CPUs. - * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but - * this does not seem compelling. Not yet, anyway.) + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. */ if (rdp->nxtlist != NULL) { - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = - rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + *rsp->orphan_donetail = rdp->nxtlist; + rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } + /* Finally, initialize the rcu_data structure's list to empty. */ + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->onofflock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + /* - * Record a quiescent state for the dying CPU. This is safe - * only because we have already cleared out the callbacks. - * (Otherwise, the RCU core might try to schedule the invocation - * of callbacks on this now-offline CPU, which would be bad.) + * If there is an rcu_barrier() operation in progress, then + * only the task doing that operation is permitted to adopt + * callbacks. To do otherwise breaks rcu_barrier() and friends + * by causing them to fail to wait for the callbacks in the + * orphanage. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ + if (rsp->rcu_barrier_in_progress && + rsp->rcu_barrier_in_progress != current) + return; + + /* Do the accounting first. */ + rdp->qlen_lazy += rsp->qlen_lazy; + rdp->qlen += rsp->qlen; + rdp->n_cbs_adopted += rsp->qlen; + rsp->qlen_lazy = 0; + rsp->qlen = 0; + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks. */ + if (rsp->orphan_donelist != NULL) { + *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; + for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = rsp->orphan_donetail; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + + /* And then adopt the callbacks that still need a grace period. */ + if (rsp->orphan_nxtlist != NULL) { + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } +} + +/* + * Trace the fact that this CPU is going offline. + */ +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +{ + RCU_TRACE(unsigned long mask); + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + + RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), "cpuofl"); - rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); - /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ } /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. + * this fact from process context. Do the remainder of the cleanup, + * including orphaning the outgoing CPU's RCU callbacks, and also + * adopting them, if there is no _rcu_barrier() instance running. * There can only be one CPU hotplug operation at a time, so no other * CPU can be attempting to update rcu_cpu_kthread_task. */ @@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) unsigned long mask; int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); rcu_node_kthread_setaffinity(rnp, -1); - /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ + /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); + /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp); + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { @@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ - rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; - rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) else break; } + smp_mb(); /* List handling before counting for rcu_barrier(). */ + rdp->qlen_lazy -= count_lazy; + rdp->qlen -= count; + rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) @@ -1824,11 +1879,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; if (lazy) rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); + smp_mb(); /* Count before adding callback for rcu_barrier(). */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, @@ -1894,6 +1952,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Because a context switch is a grace period for RCU-sched and RCU-bh, + * any blocking grace-period wait automatically implies a grace period + * if there is only one CPU online at any point time during execution + * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds + * some overhead: RCU still operates correctly. + * + * Of course, sampling num_online_cpus() with preemption enabled can + * give erroneous results if there are concurrent CPU-hotplug operations. + * For example, given a demonic sequence of preemptions in num_online_cpus() + * and CPU-hotplug operations, there could be two or more CPUs online at + * all times, but num_online_cpus() might well return one (or even zero). + * + * However, all such demonic sequences require at least one CPU-offline + * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer + * is only a problem if there is an RCU read-side critical section executing + * throughout. But RCU-sched and RCU-bh read-side critical sections + * disable either preemption or bh, which prevents a CPU from going offline. + * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return + * that there is only one CPU when in fact there was more than one throughout + * is when there were no RCU readers in the system. If there are no + * RCU readers, the grace period by definition can be of zero length, + * regardless of the number of online CPUs. + */ +static inline int rcu_blocking_is_gp(void) +{ + might_sleep(); /* Check for RCU read-side critical section. */ + return num_online_cpus() <= 1; +} + /** * synchronize_sched - wait until an rcu-sched grace period has elapsed. * @@ -2167,11 +2257,10 @@ static int rcu_cpu_has_callbacks(int cpu) rcu_preempt_cpu_has_callbacks(cpu); } -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - +/* + * RCU callback function for _rcu_barrier(). If we are last, wake + * up the task executing _rcu_barrier(). + */ static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -2201,27 +2290,94 @@ static void _rcu_barrier(struct rcu_state *rsp, void (*call_rcu_func)(struct rcu_head *head, void (*func)(struct rcu_head *head))) { - BUG_ON(in_interrupt()); + int cpu; + unsigned long flags; + struct rcu_data *rdp; + struct rcu_head rh; + + init_rcu_head_on_stack(&rh); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); + + smp_mb(); /* Prevent any prior operations from leaking in. */ + /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. Note that on_each_cpu() disables irqs, which prevents - * any CPUs from coming online or going offline until each online - * CPU has queued its RCU-barrier callback. + * Initialize the count to one rather than to zero in order to + * avoid a too-soon return to zero in case of a short grace period + * (or preemption of this task). Also flag this task as doing + * an rcu_barrier(). This will prevent anyone else from adopting + * orphaned callbacks, which could cause otherwise failure if a + * CPU went offline and quickly came back online. To see this, + * consider the following sequence of events: + * + * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. + * 2. CPU 1 goes offline, orphaning its callbacks. + * 3. CPU 0 adopts CPU 1's orphaned callbacks. + * 4. CPU 1 comes back online. + * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. + * 6. Both rcu_barrier_callback() callbacks are invoked, awakening + * us -- but before CPU 1's orphaned callbacks are invoked!!! */ + init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + raw_spin_lock_irqsave(&rsp->onofflock, flags); + rsp->rcu_barrier_in_progress = current; + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* + * Force every CPU with callbacks to register a new callback + * that will tell us when all the preceding callbacks have + * been invoked. If an offline CPU has callbacks, wait for + * it to either come back online or to finish orphaning those + * callbacks. + */ + for_each_possible_cpu(cpu) { + preempt_disable(); + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_is_offline(cpu)) { + preempt_enable(); + while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) + schedule_timeout_interruptible(1); + } else if (ACCESS_ONCE(rdp->qlen)) { + smp_call_function_single(cpu, rcu_barrier_func, + (void *)call_rcu_func, 1); + preempt_enable(); + } else { + preempt_enable(); + } + } + + /* + * Now that all online CPUs have rcu_barrier_callback() callbacks + * posted, we can adopt all of the orphaned callbacks and place + * an rcu_barrier_callback() callback after them. When that is done, + * we are guaranteed to have an rcu_barrier_callback() callback + * following every callback that could possibly have been + * registered before _rcu_barrier() was called. + */ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + rcu_adopt_orphan_cbs(rsp); + rsp->rcu_barrier_in_progress = NULL; + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + atomic_inc(&rcu_barrier_cpu_count); + smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ + call_rcu_func(&rh, rcu_barrier_callback); + + /* + * Now that we have an rcu_barrier_callback() callback on each + * CPU, and thus each counted, remove the initial count. + */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); + + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rcu_barrier_completion); + + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); + + destroy_rcu_head_on_stack(&rh); } /** @@ -2418,7 +2574,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = RCU_FANOUT_LEAF; + rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0..7f5d138 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -29,18 +29,14 @@ #include <linux/seqlock.h> /* - * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. * In theory, it should be possible to add more levels straightforwardly. * In practice, this did work well going from three levels to four. * Of course, your mileage may vary. */ #define MAX_RCU_LVLS 4 -#if CONFIG_RCU_FANOUT > 16 -#define RCU_FANOUT_LEAF 16 -#else /* #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) -#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) @@ -371,6 +367,17 @@ struct rcu_state { raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. */ + struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + /* need a grace period. */ + struct rcu_head **orphan_nxttail; /* Tail of above. */ + struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + /* are ready to invoke. */ + struct rcu_head **orphan_donetail; /* Tail of above. */ + long qlen_lazy; /* Number of lazy callbacks. */ + long qlen; /* Total number of callbacks. */ + struct task_struct *rcu_barrier_in_progress; + /* Task doing rcu_barrier(), */ + /* or NULL if no barrier. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464..2411000 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -static void rcu_preempt_note_context_switch(int cpu) +void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = __this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(cpu); + rcu_preempt_qs(smp_processor_id()); local_irq_restore(flags); } @@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ -} - -/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) { } +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} + #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +/* Loop counter for rcu_prepare_for_idle(). */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); +/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ -static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ +/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ +static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ +static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +/* Enable special processing on first attempt to enter dyntick-idle mode. */ +static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +/* Running count of non-lazy callbacks posted, never decremented. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ */ int rcu_needs_cpu(int cpu) { + /* Flag a new idle sojourn to the idle-entry state machine. */ + per_cpu(rcu_idle_first_pass, cpu) = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; @@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) } /* + * Handler for smp_call_function_single(). The only point of this + * handler is to wake the CPU up, so the handler does only tracing. + */ +void rcu_idle_demigrate(void *unused) +{ + trace_rcu_prep_idle("Demigrate"); +} + +/* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks * pending. The hander doesn't really need to do anything because the * real work is done upon re-entry to idle, or by the next scheduling-clock * interrupt should idle not be re-entered. + * + * One special case: the timer gets migrated without awakening the CPU + * on which the timer was scheduled on. In this case, we must wake up + * that CPU. We do so with smp_call_function_single(). */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +static void rcu_idle_gp_timer_func(unsigned long cpu_in) { + int cpu = (int)cpu_in; + trace_rcu_prep_idle("Timer"); - return HRTIMER_NORESTART; + if (cpu != smp_processor_id()) + smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); + else + WARN_ON_ONCE(1); /* Getting here can hang the system... */ } /* @@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) */ static void rcu_prepare_for_idle_init(int cpu) { - static int firsttime = 1; - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - - hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtp->function = rcu_idle_gp_timer_func; - if (firsttime) { - unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - - rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); - upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); - rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); - firsttime = 0; - } + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_timer_func, cpu); + per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; + per_cpu(rcu_idle_first_pass, cpu) = 1; } /* @@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + trace_rcu_prep_idle("Cleanup after idle"); } /* @@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { + struct timer_list *tp; + + /* + * If this is an idle re-entry, for example, due to use of + * RCU_NONIDLE() or the new idle-loop tracing API within the idle + * loop, then don't take any state-machine actions, unless the + * momentary exit from idle queued additional non-lazy callbacks. + * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * pending. + */ + if (!per_cpu(rcu_idle_first_pass, cpu) && + (per_cpu(rcu_nonlazy_posted, cpu) == + per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (rcu_cpu_has_callbacks(cpu)) { + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + } + return; + } + per_cpu(rcu_idle_first_pass, cpu) = 0; + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu) - 1; + /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_GP_DELAY; else - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_LAZY_GP_DELAY; + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Callbacks drained"); } +/* + * Keep a running count of the number of non-lazy callbacks posted + * on this CPU. This running counter (which is never decremented) allows + * rcu_prepare_for_idle() to detect when something out of the idle loop + * posts a callback, even if an equal number of callbacks are invoked. + * Of course, callbacks should only be posted from within a trace event + * designed to be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ + __this_cpu_add(rcu_nonlazy_posted, 1); +} + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); - sprintf(cp, "drain=%d %c timer=%lld", + sprintf(cp, "drain=%d %c timer=%lu", per_cpu(rcu_dyntick_drain, cpu), per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', - hrtimer_active(hrtp) - ? ktime_to_us(hrtimer_get_remaining(hrtp)) - : -1); + timer_pending(tltp) ? tltp->expires - jiffies : -1); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459ed..d4bc16d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4603b9d..5d89eb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2083,6 +2083,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); diff --git a/kernel/timer.c b/kernel/timer.c index a297ffc..837c552f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); * * mod_timer_pinned() is a way to update the expire field of an * active timer (if the timer is inactive it will be activated) - * and not allow the timer to be migrated to a different CPU. + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline. If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline. * * mod_timer_pinned(timer, expires) is equivalent to: * diff --git a/lib/list_debug.c b/lib/list_debug.c index 982b850..3810b48 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -10,6 +10,7 @@ #include <linux/list.h> #include <linux/bug.h> #include <linux/kernel.h> +#include <linux/rculist.h> /* * Insert a new entry between two known consecutive entries. @@ -75,3 +76,24 @@ void list_del(struct list_head *entry) entry->prev = LIST_POISON2; } EXPORT_SYMBOL(list_del); + +/* + * RCU variants. + */ +void __list_add_rcu(struct list_head *new, + struct list_head *prev, struct list_head *next) +{ + WARN(next->prev != prev, + "list_add_rcu corruption. next->prev should be " + "prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + WARN(prev->next != next, + "list_add_rcu corruption. prev->next should be " + "next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + new->next = next; + new->prev = prev; + rcu_assign_pointer(list_next_rcu(prev), new); + next->prev = new; +} +EXPORT_SYMBOL(__list_add_rcu); |