author     hselasky <hselasky@FreeBSD.org>  2015-01-22 11:12:42 +0000
committer  hselasky <hselasky@FreeBSD.org>  2015-01-22 11:12:42 +0000
commit     c0aba3b50d494dc9fefa1cd1304481521fa05a36 (patch)
tree       499d9197fe4fbf2671c76f17e92abf2f0cf51d05 /sys/kern
parent     8925dffab199f6ca4955328774e9fa6d39e9f0c8 (diff)
Revert for r277213:
FreeBSD developers need more time to review patches in the surrounding areas, such as the TCP stack, which use MPSAFE callouts, before distribution of callouts across multiple CPUs is restored. Bump __FreeBSD_version instead of reverting it.

Suggested by:		kmacy, adrian, glebius and kib
Differential Revision:	https://reviews.freebsd.org/D1438
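For reference, the interface this revert restores is the long-standing MPSAFE callout API that consumers such as the TCP stack are written against. The following is a minimal illustrative sketch, not part of the commit; the my_softc structure and my_timer_fn handler are hypothetical names showing the callout_init(9)/callout_reset(9)/callout_drain(9) pattern that keeps working after the revert:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/callout.h>

	struct my_softc {
		struct callout	timer;		/* per-instance timer */
		int		ticks_left;
	};

	static void
	my_timer_fn(void *arg)
	{
		struct my_softc *sc = arg;

		/* Runs without Giant because the callout is MPSAFE. */
		if (--sc->ticks_left > 0)
			callout_reset(&sc->timer, hz, my_timer_fn, sc);
	}

	static void
	my_start(struct my_softc *sc)
	{
		/* No backing lock: the handler does its own locking. */
		callout_init(&sc->timer, CALLOUT_MPSAFE);
		sc->ticks_left = 10;
		callout_reset(&sc->timer, hz, my_timer_fn, sc);
	}

	static void
	my_stop(struct my_softc *sc)
	{
		/* Cancel and wait for any in-flight handler. */
		callout_drain(&sc->timer);
	}

With the revert in place, callout_drain() again blocks inside _callout_stop_safe() (see the kern_timeout.c hunks below) instead of using the asynchronous callout_drain_async() mechanism removed here.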
Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/init_main.c       |    3
-rw-r--r--  sys/kern/kern_condvar.c    |   12
-rw-r--r--  sys/kern/kern_lock.c       |    6
-rw-r--r--  sys/kern/kern_switch.c     |    2
-rw-r--r--  sys/kern/kern_synch.c      |   13
-rw-r--r--  sys/kern/kern_thread.c     |    8
-rw-r--r--  sys/kern/kern_timeout.c    | 1053
-rw-r--r--  sys/kern/subr_sleepqueue.c |  147
8 files changed, 633 insertions, 611 deletions
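Most of the churn is in kern_timeout.c, which restores the original callout_stop()/callout_drain() split on top of _callout_stop_safe(). As a rough sketch of the contract consumers rely on (illustrative only, not part of the commit; teardown_timer() is a hypothetical helper):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/callout.h>

	static void
	teardown_timer(struct callout *c)
	{
		/*
		 * callout_stop() only cancels a pending callout and returns
		 * immediately; the handler may still be running on another
		 * CPU afterwards.
		 */
		callout_stop(c);

		/*
		 * callout_drain() goes through _callout_stop_safe() with
		 * "safe" set and, if a handler is in flight, sleeps on the
		 * cc_exec_waiting channel until it has finished.  Only after
		 * that is it safe to free the memory holding *c.
		 */
		callout_drain(c);
	}

In practice callout_drain() alone is sufficient before freeing the callout; callout_stop() is shown only to contrast the two return paths.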
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 922959f..beb49bc 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -504,8 +504,7 @@ proc0_init(void *dummy __unused)
callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
- mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN);
- callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
/* Create credentials. */
p->p_ucred = crget();
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
index 8c2691b..2700a25 100644
--- a/sys/kern/kern_condvar.c
+++ b/sys/kern/kern_condvar.c
@@ -313,13 +313,15 @@ _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt,
DROP_GIANT();
sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
- sleepq_release(cvp);
sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
}
- sleepq_lock(cvp);
rval = sleepq_timedwait(cvp, 0);
#ifdef KTRACE
@@ -381,13 +383,15 @@ _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock,
sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
SLEEPQ_INTERRUPTIBLE, 0);
- sleepq_release(cvp);
sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
}
- sleepq_lock(cvp);
rval = sleepq_timedwait_sig(cvp, 0);
#ifdef KTRACE
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 38c8707..36a8470 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -210,11 +210,9 @@ sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
GIANT_SAVE();
sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
SLEEPQ_INTERRUPTIBLE : 0), queue);
- if ((flags & LK_TIMELOCK) && timo) {
- sleepq_release(&lk->lock_object);
+ if ((flags & LK_TIMELOCK) && timo)
sleepq_set_timeout(&lk->lock_object, timo);
- sleepq_lock(&lk->lock_object);
- }
+
/*
* Decisional switch for real sleeping.
*/
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 61d9149..d0009b1 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -93,6 +93,8 @@ SCHED_STAT_DEFINE_VAR(turnstile,
&DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
&DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
+SCHED_STAT_DEFINE_VAR(sleepqtimo,
+ &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
&DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 19bf4e8..9501ba2 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -236,16 +236,12 @@ _sleep(void *ident, struct lock_object *lock, int priority,
* return from cursig().
*/
sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
+ if (sbt != 0)
+ sleepq_set_timeout_sbt(ident, sbt, pr, flags);
if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
sleepq_release(ident);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
- if (sbt != 0)
- sleepq_set_timeout_sbt(ident, sbt, pr, flags);
- sleepq_lock(ident);
- } else if (sbt != 0) {
- sleepq_release(ident);
- sleepq_set_timeout_sbt(ident, sbt, pr, flags);
sleepq_lock(ident);
}
if (sbt != 0 && catch)
@@ -310,11 +306,8 @@ msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
* We put ourselves on the sleep queue and start our timeout.
*/
sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
- if (sbt != 0) {
- sleepq_release(ident);
+ if (sbt != 0)
sleepq_set_timeout_sbt(ident, sbt, pr, flags);
- sleepq_lock(ident);
- }
/*
* Can't call ktrace with any spin locks held so it can lock the
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index b1e1a12..2d0b0d2 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -149,9 +149,6 @@ thread_ctor(void *mem, int size, void *arg, int flags)
audit_thread_alloc(td);
#endif
umtx_thread_alloc(td);
-
- mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN);
- callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0);
return (0);
}
@@ -165,10 +162,6 @@ thread_dtor(void *mem, int size, void *arg)
td = (struct thread *)mem;
- /* make sure to drain any use of the "td->td_slpcallout" */
- callout_drain(&td->td_slpcallout);
- mtx_destroy(&td->td_slpmutex);
-
#ifdef INVARIANTS
/* Verify that this thread is in a safe state to free. */
switch (td->td_state) {
@@ -551,6 +544,7 @@ thread_link(struct thread *td, struct proc *p)
LIST_INIT(&td->td_lprof[0]);
LIST_INIT(&td->td_lprof[1]);
sigqueue_init(&td->td_sigqueue, p);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
p->p_numthreads++;
}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 4336faa..13822fd 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -54,8 +54,6 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
-#include <sys/rmlock.h>
-#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sleepqueue.h>
@@ -126,216 +124,37 @@ SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_
*/
u_int callwheelsize, callwheelmask;
-typedef void callout_mutex_op_t(struct lock_object *);
-typedef int callout_owned_op_t(struct lock_object *);
-
-struct callout_mutex_ops {
- callout_mutex_op_t *lock;
- callout_mutex_op_t *unlock;
- callout_owned_op_t *owned;
-};
-
-enum {
- CALLOUT_LC_UNUSED_0,
- CALLOUT_LC_UNUSED_1,
- CALLOUT_LC_UNUSED_2,
- CALLOUT_LC_UNUSED_3,
- CALLOUT_LC_SPIN,
- CALLOUT_LC_MUTEX,
- CALLOUT_LC_RW,
- CALLOUT_LC_RM,
-};
-
-static void
-callout_mutex_op_none(struct lock_object *lock)
-{
-}
-
-static int
-callout_owned_op_none(struct lock_object *lock)
-{
- return (0);
-}
-
-static void
-callout_mutex_lock(struct lock_object *lock)
-{
- mtx_lock((struct mtx *)lock);
-}
-
-static void
-callout_mutex_unlock(struct lock_object *lock)
-{
- mtx_unlock((struct mtx *)lock);
-}
-
-static void
-callout_mutex_lock_spin(struct lock_object *lock)
-{
- mtx_lock_spin((struct mtx *)lock);
-}
-
-static void
-callout_mutex_unlock_spin(struct lock_object *lock)
-{
- mtx_unlock_spin((struct mtx *)lock);
-}
-
-static int
-callout_mutex_owned(struct lock_object *lock)
-{
- return (mtx_owned((struct mtx *)lock));
-}
-
-static void
-callout_rm_wlock(struct lock_object *lock)
-{
- rm_wlock((struct rmlock *)lock);
-}
-
-static void
-callout_rm_wunlock(struct lock_object *lock)
-{
- rm_wunlock((struct rmlock *)lock);
-}
-
-static int
-callout_rm_owned(struct lock_object *lock)
-{
- return (rm_wowned((struct rmlock *)lock));
-}
-
-static void
-callout_rw_wlock(struct lock_object *lock)
-{
- rw_wlock((struct rwlock *)lock);
-}
-
-static void
-callout_rw_wunlock(struct lock_object *lock)
-{
- rw_wunlock((struct rwlock *)lock);
-}
-
-static int
-callout_rw_owned(struct lock_object *lock)
-{
- return (rw_wowned((struct rwlock *)lock));
-}
-
-static const struct callout_mutex_ops callout_mutex_ops[8] = {
- [CALLOUT_LC_UNUSED_0] = {
- .lock = callout_mutex_op_none,
- .unlock = callout_mutex_op_none,
- .owned = callout_owned_op_none,
- },
- [CALLOUT_LC_UNUSED_1] = {
- .lock = callout_mutex_op_none,
- .unlock = callout_mutex_op_none,
- .owned = callout_owned_op_none,
- },
- [CALLOUT_LC_UNUSED_2] = {
- .lock = callout_mutex_op_none,
- .unlock = callout_mutex_op_none,
- .owned = callout_owned_op_none,
- },
- [CALLOUT_LC_UNUSED_3] = {
- .lock = callout_mutex_op_none,
- .unlock = callout_mutex_op_none,
- .owned = callout_owned_op_none,
- },
- [CALLOUT_LC_SPIN] = {
- .lock = callout_mutex_lock_spin,
- .unlock = callout_mutex_unlock_spin,
- .owned = callout_mutex_owned,
- },
- [CALLOUT_LC_MUTEX] = {
- .lock = callout_mutex_lock,
- .unlock = callout_mutex_unlock,
- .owned = callout_mutex_owned,
- },
- [CALLOUT_LC_RW] = {
- .lock = callout_rw_wlock,
- .unlock = callout_rw_wunlock,
- .owned = callout_rw_owned,
- },
- [CALLOUT_LC_RM] = {
- .lock = callout_rm_wlock,
- .unlock = callout_rm_wunlock,
- .owned = callout_rm_owned,
- },
-};
-
-static void
-callout_lock_client(int c_flags, struct lock_object *c_lock)
-{
- callout_mutex_ops[CALLOUT_GET_LC(c_flags)].lock(c_lock);
-}
-
-static void
-callout_unlock_client(int c_flags, struct lock_object *c_lock)
-{
- callout_mutex_ops[CALLOUT_GET_LC(c_flags)].unlock(c_lock);
-}
-
-#ifdef SMP
-static int
-callout_lock_owned_client(int c_flags, struct lock_object *c_lock)
-{
- return (callout_mutex_ops[CALLOUT_GET_LC(c_flags)].owned(c_lock));
-}
-#endif
-
/*
- * The callout CPU exec structure represent information necessary for
- * describing the state of callouts currently running on the CPU and
- * for handling deferred callout restarts.
- *
- * In particular, the first entry of the array cc_exec_entity holds
- * information for callouts running from the SWI thread context, while
- * the second one holds information for callouts running directly from
- * the hardware interrupt context.
+ * The callout cpu exec entities represent informations necessary for
+ * describing the state of callouts currently running on the CPU and the ones
+ * necessary for migrating callouts to the new callout cpu. In particular,
+ * the first entry of the array cc_exec_entity holds informations for callout
+ * running in SWI thread context, while the second one holds informations
+ * for callout running directly from hardware interrupt context.
+ * The cached informations are very important for deferring migration when
+ * the migrating callout is already running.
*/
struct cc_exec {
- /*
- * The "cc_curr" points to the currently executing callout and
- * is protected by the "cc_lock" spinlock. If no callback is
- * currently executing it is equal to "NULL".
- */
+ struct callout *cc_next;
struct callout *cc_curr;
- /*
- * The "cc_restart_args" structure holds the argument for a
- * deferred callback restart and is protected by the "cc_lock"
- * spinlock. The structure is only valid if "cc_restart" is
- * "true". If "cc_restart" is "false" the information in the
- * "cc_restart_args" structure shall be ignored.
- */
- struct callout_args cc_restart_args;
- bool cc_restart;
- /*
- * The "cc_cancel" variable allows the currently pending
- * callback to be atomically cancelled. This field is write
- * protected by the "cc_lock" spinlock.
- */
- bool cc_cancel;
- /*
- * The "cc_drain_fn" points to a function which shall be
- * called with the argument stored in "cc_drain_arg" when an
- * asynchronous drain is performed. This field is write
- * protected by the "cc_lock" spinlock.
- */
- callout_func_t *cc_drain_fn;
- void *cc_drain_arg;
+#ifdef SMP
+ void (*ce_migration_func)(void *);
+ void *ce_migration_arg;
+ int ce_migration_cpu;
+ sbintime_t ce_migration_time;
+ sbintime_t ce_migration_prec;
+#endif
+ bool cc_cancel;
+ bool cc_waiting;
};
/*
- * There is one "struct callout_cpu" per CPU, holding all relevant
+ * There is one struct callout_cpu per cpu, holding all relevant
* state for the callout processing thread on the individual CPU.
*/
struct callout_cpu {
struct mtx_padalign cc_lock;
struct cc_exec cc_exec_entity[2];
- struct callout *cc_exec_next_dir;
struct callout *cc_callout;
struct callout_list *cc_callwheel;
struct callout_tailq cc_expireq;
@@ -347,7 +166,27 @@ struct callout_cpu {
char cc_ktr_event_name[20];
};
+#define cc_exec_curr cc_exec_entity[0].cc_curr
+#define cc_exec_next cc_exec_entity[0].cc_next
+#define cc_exec_cancel cc_exec_entity[0].cc_cancel
+#define cc_exec_waiting cc_exec_entity[0].cc_waiting
+#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
+#define cc_exec_next_dir cc_exec_entity[1].cc_next
+#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
+#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
+
#ifdef SMP
+#define cc_migration_func cc_exec_entity[0].ce_migration_func
+#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
+#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
+#define cc_migration_time cc_exec_entity[0].ce_migration_time
+#define cc_migration_prec cc_exec_entity[0].ce_migration_prec
+#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func
+#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
+#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
+#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time
+#define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec
+
struct callout_cpu cc_cpu[MAXCPU];
#define CPUBLOCK MAXCPU
#define CC_CPU(cpu) (&cc_cpu[(cpu)])
@@ -372,9 +211,60 @@ static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
+/**
+ * Locked by cc_lock:
+ * cc_curr - If a callout is in progress, it is cc_curr.
+ * If cc_curr is non-NULL, threads waiting in
+ * callout_drain() will be woken up as soon as the
+ * relevant callout completes.
+ * cc_cancel - Changing to 1 with both callout_lock and cc_lock held
+ * guarantees that the current callout will not run.
+ * The softclock() function sets this to 0 before it
+ * drops callout_lock to acquire c_lock, and it calls
+ * the handler only if curr_cancelled is still 0 after
+ * cc_lock is successfully acquired.
+ * cc_waiting - If a thread is waiting in callout_drain(), then
+ * callout_wait is nonzero. Set only when
+ * cc_curr is non-NULL.
+ */
+
+/*
+ * Resets the execution entity tied to a specific callout cpu.
+ */
+static void
+cc_cce_cleanup(struct callout_cpu *cc, int direct)
+{
+
+ cc->cc_exec_entity[direct].cc_curr = NULL;
+ cc->cc_exec_entity[direct].cc_next = NULL;
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ cc->cc_exec_entity[direct].cc_waiting = false;
+#ifdef SMP
+ cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
+ cc->cc_exec_entity[direct].ce_migration_time = 0;
+ cc->cc_exec_entity[direct].ce_migration_prec = 0;
+ cc->cc_exec_entity[direct].ce_migration_func = NULL;
+ cc->cc_exec_entity[direct].ce_migration_arg = NULL;
+#endif
+}
+
+/*
+ * Checks if migration is requested by a specific callout cpu.
+ */
+static int
+cc_cce_migrating(struct callout_cpu *cc, int direct)
+{
+
+#ifdef SMP
+ return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
+#else
+ return (0);
+#endif
+}
+
/*
- * Kernel low level callwheel initialization called from cpu0 during
- * kernel startup:
+ * Kernel low level callwheel initialization
+ * called on cpu0 during kernel startup.
*/
static void
callout_callwheel_init(void *dummy)
@@ -434,6 +324,8 @@ callout_cpu_init(struct callout_cpu *cc, int cpu)
LIST_INIT(&cc->cc_callwheel[i]);
TAILQ_INIT(&cc->cc_expireq);
cc->cc_firstevent = SBT_MAX;
+ for (i = 0; i < 2; i++)
+ cc_cce_cleanup(cc, i);
snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
"callwheel cpu %d", cpu);
if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */
@@ -441,11 +333,41 @@ callout_cpu_init(struct callout_cpu *cc, int cpu)
for (i = 0; i < ncallout; i++) {
c = &cc->cc_callout[i];
callout_init(c, 0);
- c->c_flags |= CALLOUT_LOCAL_ALLOC;
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
}
}
+#ifdef SMP
+/*
+ * Switches the cpu tied to a specific callout.
+ * The function expects a locked incoming callout cpu and returns with
+ * locked outcoming callout cpu.
+ */
+static struct callout_cpu *
+callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
+{
+ struct callout_cpu *new_cc;
+
+ MPASS(c != NULL && cc != NULL);
+ CC_LOCK_ASSERT(cc);
+
+ /*
+ * Avoid interrupts and preemption firing after the callout cpu
+ * is blocked in order to avoid deadlocks as the new thread
+ * may be willing to acquire the callout cpu lock.
+ */
+ c->c_cpu = CPUBLOCK;
+ spinlock_enter();
+ CC_UNLOCK(cc);
+ new_cc = CC_CPU(new_cpu);
+ CC_LOCK(new_cc);
+ spinlock_exit();
+ c->c_cpu = new_cpu;
+ return (new_cc);
+}
+#endif
+
/*
* Start standard softclock thread.
*/
@@ -522,8 +444,9 @@ callout_process(sbintime_t now)
#ifdef CALLOUT_PROFILING
int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
#endif
+
cc = CC_SELF();
- CC_LOCK(cc);
+ mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
/* Compute the buckets of the last scan and present times. */
firstb = callout_hash(cc->cc_lastscan);
@@ -626,7 +549,7 @@ next:
avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
#endif
- CC_UNLOCK(cc);
+ mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
/*
* swi_sched acquires the thread lock, so we don't want to call it
* with cc_lock held; incorrect locking order.
@@ -639,55 +562,49 @@ static struct callout_cpu *
callout_lock(struct callout *c)
{
struct callout_cpu *cc;
- cc = CC_CPU(c->c_cpu);
- CC_LOCK(cc);
+ int cpu;
+
+ for (;;) {
+ cpu = c->c_cpu;
+#ifdef SMP
+ if (cpu == CPUBLOCK) {
+ while (c->c_cpu == CPUBLOCK)
+ cpu_spinwait();
+ continue;
+ }
+#endif
+ cc = CC_CPU(cpu);
+ CC_LOCK(cc);
+ if (cpu == c->c_cpu)
+ break;
+ CC_UNLOCK(cc);
+ }
return (cc);
}
-static struct callout_cpu *
-callout_cc_add_locked(struct callout *c, struct callout_cpu *cc,
- struct callout_args *coa, bool can_swap_cpu)
+static void
+callout_cc_add(struct callout *c, struct callout_cpu *cc,
+ sbintime_t sbt, sbintime_t precision, void (*func)(void *),
+ void *arg, int cpu, int flags)
{
-#ifndef NO_EVENTTIMERS
- sbintime_t sbt;
-#endif
int bucket;
CC_LOCK_ASSERT(cc);
-
- /* update flags before swapping locks, if any */
- c->c_flags &= ~(CALLOUT_PROCESSED | CALLOUT_DIRECT | CALLOUT_DEFRESTART);
- if (coa->flags & C_DIRECT_EXEC)
- c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_DIRECT);
- else
- c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
-
-#ifdef SMP
- /*
- * Check if we are changing the CPU on which the callback
- * should be executed and if we have a lock protecting us:
- */
- if (can_swap_cpu != false && coa->cpu != c->c_cpu &&
- callout_lock_owned_client(c->c_flags, c->c_lock) != 0) {
- CC_UNLOCK(cc);
- c->c_cpu = coa->cpu;
- cc = callout_lock(c);
- }
-#endif
- if (coa->time < cc->cc_lastscan)
- coa->time = cc->cc_lastscan;
- c->c_arg = coa->arg;
- c->c_func = coa->func;
- c->c_time = coa->time;
- c->c_precision = coa->precision;
-
+ if (sbt < cc->cc_lastscan)
+ sbt = cc->cc_lastscan;
+ c->c_arg = arg;
+ c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+ if (flags & C_DIRECT_EXEC)
+ c->c_flags |= CALLOUT_DIRECT;
+ c->c_flags &= ~CALLOUT_PROCESSED;
+ c->c_func = func;
+ c->c_time = sbt;
+ c->c_precision = precision;
bucket = callout_get_bucket(c->c_time);
CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
c, (int)(c->c_precision >> 32),
(u_int)(c->c_precision & 0xffffffff));
LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
-
- /* Ensure we are first to be scanned, if called via a callback */
if (cc->cc_bucket == bucket)
cc->cc_exec_next_dir = c;
#ifndef NO_EVENTTIMERS
@@ -700,16 +617,17 @@ callout_cc_add_locked(struct callout *c, struct callout_cpu *cc,
sbt = c->c_time + c->c_precision;
if (sbt < cc->cc_firstevent) {
cc->cc_firstevent = sbt;
- cpu_new_callout(coa->cpu, sbt, c->c_time);
+ cpu_new_callout(cpu, sbt, c->c_time);
}
#endif
- return (cc);
}
static void
callout_cc_del(struct callout *c, struct callout_cpu *cc)
{
+ if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0)
+ return;
c->c_func = NULL;
SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
}
@@ -721,10 +639,20 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc,
#endif
int direct)
{
- callout_func_t *c_func;
+ struct rm_priotracker tracker;
+ void (*c_func)(void *);
void *c_arg;
+ struct lock_class *class;
struct lock_object *c_lock;
+ uintptr_t lock_status;
int c_flags;
+#ifdef SMP
+ struct callout_cpu *new_cc;
+ void (*new_func)(void *);
+ void *new_arg;
+ int flags, new_cpu;
+ sbintime_t new_prec, new_time;
+#endif
#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
sbintime_t sbt1, sbt2;
struct timespec ts2;
@@ -735,39 +663,37 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc,
KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
(CALLOUT_PENDING | CALLOUT_ACTIVE),
("softclock_call_cc: pend|act %p %x", c, c->c_flags));
+ class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
+ lock_status = 0;
+ if (c->c_flags & CALLOUT_SHAREDLOCK) {
+ if (class == &lock_class_rm)
+ lock_status = (uintptr_t)&tracker;
+ else
+ lock_status = 1;
+ }
c_lock = c->c_lock;
c_func = c->c_func;
c_arg = c->c_arg;
c_flags = c->c_flags;
-
- /* remove pending bit */
- c->c_flags &= ~CALLOUT_PENDING;
-
- /* reset our local state */
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ else
+ c->c_flags &= ~CALLOUT_PENDING;
cc->cc_exec_entity[direct].cc_curr = c;
- cc->cc_exec_entity[direct].cc_restart = false;
- cc->cc_exec_entity[direct].cc_drain_fn = NULL;
- cc->cc_exec_entity[direct].cc_drain_arg = NULL;
-
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ CC_UNLOCK(cc);
if (c_lock != NULL) {
- cc->cc_exec_entity[direct].cc_cancel = false;
- CC_UNLOCK(cc);
-
- /* unlocked region for switching locks */
-
- callout_lock_client(c_flags, c_lock);
-
+ class->lc_lock(c_lock, lock_status);
/*
- * Check if the callout may have been cancelled while
- * we were switching locks. Even though the callout is
- * specifying a lock, it might not be certain this
- * lock is locked when starting and stopping callouts.
+ * The callout may have been cancelled
+ * while we switched locks.
*/
- CC_LOCK(cc);
if (cc->cc_exec_entity[direct].cc_cancel) {
- callout_unlock_client(c_flags, c_lock);
- goto skip_cc_locked;
+ class->lc_unlock(c_lock);
+ goto skip;
}
+ /* The callout cannot be stopped now. */
+ cc->cc_exec_entity[direct].cc_cancel = true;
if (c_lock == &Giant.lock_object) {
#ifdef CALLOUT_PROFILING
(*gcalls)++;
@@ -788,11 +714,6 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc,
CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
c, c_func, c_arg);
}
- /* The callout cannot be stopped now! */
- cc->cc_exec_entity[direct].cc_cancel = true;
- CC_UNLOCK(cc);
-
- /* unlocked region */
KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
"func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
@@ -819,40 +740,85 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc,
#endif
KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
CTR1(KTR_CALLOUT, "callout %p finished", c);
-
- /*
- * At this point the callback structure might have been freed,
- * so we need to check the previously copied value of
- * "c->c_flags":
- */
if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
- callout_unlock_client(c_flags, c_lock);
-
+ class->lc_unlock(c_lock);
+skip:
CC_LOCK(cc);
-
-skip_cc_locked:
KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
cc->cc_exec_entity[direct].cc_curr = NULL;
-
- /* Check if there is anything which needs draining */
- if (cc->cc_exec_entity[direct].cc_drain_fn != NULL) {
+ if (cc->cc_exec_entity[direct].cc_waiting) {
/*
- * Unlock the CPU callout last, so that any use of
- * structures belonging to the callout are complete:
+ * There is someone waiting for the
+ * callout to complete.
+ * If the callout was scheduled for
+ * migration just cancel it.
*/
+ if (cc_cce_migrating(cc, direct)) {
+ cc_cce_cleanup(cc, direct);
+
+ /*
+ * It should be assert here that the callout is not
+ * destroyed but that is not easy.
+ */
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ }
+ cc->cc_exec_entity[direct].cc_waiting = false;
CC_UNLOCK(cc);
- /* call drain function unlocked */
- cc->cc_exec_entity[direct].cc_drain_fn(
- cc->cc_exec_entity[direct].cc_drain_arg);
+ wakeup(&cc->cc_exec_entity[direct].cc_waiting);
CC_LOCK(cc);
- } else if (c_flags & CALLOUT_LOCAL_ALLOC) {
- /* return callout back to freelist */
- callout_cc_del(c, cc);
- } else if (cc->cc_exec_entity[direct].cc_restart) {
- /* [re-]schedule callout, if any */
- cc = callout_cc_add_locked(c, cc,
- &cc->cc_exec_entity[direct].cc_restart_args, false);
+ } else if (cc_cce_migrating(cc, direct)) {
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
+ ("Migrating legacy callout %p", c));
+#ifdef SMP
+ /*
+ * If the callout was scheduled for
+ * migration just perform it now.
+ */
+ new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
+ new_time = cc->cc_exec_entity[direct].ce_migration_time;
+ new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
+ new_func = cc->cc_exec_entity[direct].ce_migration_func;
+ new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
+ cc_cce_cleanup(cc, direct);
+
+ /*
+ * It should be assert here that the callout is not destroyed
+ * but that is not easy.
+ *
+ * As first thing, handle deferred callout stops.
+ */
+ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+ CTR3(KTR_CALLOUT,
+ "deferred cancelled %p func %p arg %p",
+ c, new_func, new_arg);
+ callout_cc_del(c, cc);
+ return;
+ }
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+
+ new_cc = callout_cpu_switch(c, cc, new_cpu);
+ flags = (direct) ? C_DIRECT_EXEC : 0;
+ callout_cc_add(c, new_cc, new_time, new_prec, new_func,
+ new_arg, new_cpu, flags);
+ CC_UNLOCK(new_cc);
+ CC_LOCK(cc);
+#else
+ panic("migration should not happen");
+#endif
}
+ /*
+ * If the current callout is locally allocated (from
+ * timeout(9)) then put it on the freelist.
+ *
+ * Note: we need to check the cached copy of c_flags because
+ * if it was not local, then it's not safe to deref the
+ * callout pointer.
+ */
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
+ c->c_flags == CALLOUT_LOCAL_ALLOC,
+ ("corrupted callout"));
+ if (c_flags & CALLOUT_LOCAL_ALLOC)
+ callout_cc_del(c, cc);
}
/*
@@ -933,11 +899,10 @@ timeout(timeout_t *ftn, void *arg, int to_ticks)
/* XXX Attempt to malloc first */
panic("timeout table full");
SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
+ callout_reset(new, to_ticks, ftn, arg);
handle.callout = new;
CC_UNLOCK(cc);
- callout_reset(new, to_ticks, ftn, arg);
-
return (handle);
}
@@ -945,7 +910,6 @@ void
untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
{
struct callout_cpu *cc;
- bool match;
/*
* Check for a handle that was initialized
@@ -956,11 +920,9 @@ untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
return;
cc = callout_lock(handle.callout);
- match = (handle.callout->c_func == ftn && handle.callout->c_arg == arg);
- CC_UNLOCK(cc);
-
- if (match)
+ if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
callout_stop(handle.callout);
+ CC_UNLOCK(cc);
}
void
@@ -969,119 +931,6 @@ callout_handle_init(struct callout_handle *handle)
handle->callout = NULL;
}
-static int
-callout_restart_async(struct callout *c, struct callout_args *coa,
- callout_func_t *drain_fn, void *drain_arg)
-{
- struct callout_cpu *cc;
- int cancelled;
- int direct;
-
- cc = callout_lock(c);
-
- /* Figure out if the callout is direct or not */
- direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
-
- /*
- * Check if the callback is currently scheduled for
- * completion:
- */
- if (cc->cc_exec_entity[direct].cc_curr == c) {
- /*
- * Try to prevent the callback from running by setting
- * the "cc_cancel" variable to "true". Also check if
- * the callout was previously subject to a deferred
- * callout restart:
- */
- if (cc->cc_exec_entity[direct].cc_cancel == false ||
- (c->c_flags & CALLOUT_DEFRESTART) != 0) {
- cc->cc_exec_entity[direct].cc_cancel = true;
- cancelled = 1;
- } else {
- cancelled = 0;
- }
-
- /*
- * Prevent callback restart if "callout_drain_xxx()"
- * is being called or we are stopping the callout or
- * the callback was preallocated by us:
- */
- if (cc->cc_exec_entity[direct].cc_drain_fn != NULL ||
- coa == NULL || (c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) {
- CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
- cancelled ? "cancelled and draining" : "draining",
- c, c->c_func, c->c_arg);
-
- /* clear old flags, if any */
- c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
- CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
-
- /* clear restart flag, if any */
- cc->cc_exec_entity[direct].cc_restart = false;
-
- /* set drain function, if any */
- if (drain_fn != NULL) {
- cc->cc_exec_entity[direct].cc_drain_fn = drain_fn;
- cc->cc_exec_entity[direct].cc_drain_arg = drain_arg;
- cancelled |= 2; /* XXX define the value */
- }
- } else {
- CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
- cancelled ? "cancelled and restarting" : "restarting",
- c, c->c_func, c->c_arg);
-
- /* get us back into the game */
- c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING |
- CALLOUT_DEFRESTART);
- c->c_flags &= ~CALLOUT_PROCESSED;
-
- /* enable deferred restart */
- cc->cc_exec_entity[direct].cc_restart = true;
-
- /* store arguments for the deferred restart, if any */
- cc->cc_exec_entity[direct].cc_restart_args = *coa;
- }
- } else {
- /* stop callout */
- if (c->c_flags & CALLOUT_PENDING) {
- /*
- * The callback has not yet been executed, and
- * we simply just need to unlink it:
- */
- if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
- if (cc->cc_exec_next_dir == c)
- cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
- LIST_REMOVE(c, c_links.le);
- } else {
- TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
- }
- cancelled = 1;
- } else {
- cancelled = 0;
- }
-
- CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
- cancelled ? "rescheduled" : "scheduled",
- c, c->c_func, c->c_arg);
-
- /* [re-]schedule callout, if any */
- if (coa != NULL) {
- cc = callout_cc_add_locked(c, cc, coa, true);
- } else {
- /* clear old flags, if any */
- c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
- CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
-
- /* return callback to pre-allocated list, if any */
- if ((c->c_flags & CALLOUT_LOCAL_ALLOC) && cancelled != 0) {
- callout_cc_del(c, cc);
- }
- }
- }
- CC_UNLOCK(cc);
- return (cancelled);
-}
-
/*
* New interface; clients allocate their own callout structures.
*
@@ -1100,32 +949,25 @@ callout_restart_async(struct callout *c, struct callout_args *coa,
*/
int
callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
- callout_func_t *ftn, void *arg, int cpu, int flags)
+ void (*ftn)(void *), void *arg, int cpu, int flags)
{
- struct callout_args coa;
-
- /* store arguments for callout add function */
- coa.func = ftn;
- coa.arg = arg;
- coa.precision = precision;
- coa.flags = flags;
- coa.cpu = cpu;
-
- /* compute the rest of the arguments needed */
- if (coa.flags & C_ABSOLUTE) {
- coa.time = sbt;
- } else {
- sbintime_t pr;
+ sbintime_t to_sbt, pr;
+ struct callout_cpu *cc;
+ int cancelled, direct;
- if ((coa.flags & C_HARDCLOCK) && (sbt < tick_sbt))
+ cancelled = 0;
+ if (flags & C_ABSOLUTE) {
+ to_sbt = sbt;
+ } else {
+ if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
sbt = tick_sbt;
- if ((coa.flags & C_HARDCLOCK) ||
+ if ((flags & C_HARDCLOCK) ||
#ifdef NO_EVENTTIMERS
sbt >= sbt_timethreshold) {
- coa.time = getsbinuptime();
+ to_sbt = getsbinuptime();
/* Add safety belt for the case of hz > 1000. */
- coa.time += tc_tick_sbt - tick_sbt;
+ to_sbt += tc_tick_sbt - tick_sbt;
#else
sbt >= sbt_tickthreshold) {
/*
@@ -1135,29 +977,101 @@ callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
* active ones.
*/
#ifdef __LP64__
- coa.time = DPCPU_GET(hardclocktime);
+ to_sbt = DPCPU_GET(hardclocktime);
#else
spinlock_enter();
- coa.time = DPCPU_GET(hardclocktime);
+ to_sbt = DPCPU_GET(hardclocktime);
spinlock_exit();
#endif
#endif
- if ((coa.flags & C_HARDCLOCK) == 0)
- coa.time += tick_sbt;
+ if ((flags & C_HARDCLOCK) == 0)
+ to_sbt += tick_sbt;
} else
- coa.time = sbinuptime();
- if (SBT_MAX - coa.time < sbt)
- coa.time = SBT_MAX;
+ to_sbt = sbinuptime();
+ if (SBT_MAX - to_sbt < sbt)
+ to_sbt = SBT_MAX;
else
- coa.time += sbt;
- pr = ((C_PRELGET(coa.flags) < 0) ? sbt >> tc_precexp :
- sbt >> C_PRELGET(coa.flags));
- if (pr > coa.precision)
- coa.precision = pr;
+ to_sbt += sbt;
+ pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+ sbt >> C_PRELGET(flags));
+ if (pr > precision)
+ precision = pr;
+ }
+ /*
+ * Don't allow migration of pre-allocated callouts lest they
+ * become unbalanced.
+ */
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ cpu = c->c_cpu;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ KASSERT(!direct || c->c_lock == NULL,
+ ("%s: direct callout %p has lock", __func__, c));
+ cc = callout_lock(c);
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * We're being asked to reschedule a callout which is
+ * currently in progress. If there is a lock then we
+ * can cancel the callout if it has not really started.
+ */
+ if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
+ cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
+ if (cc->cc_exec_entity[direct].cc_waiting) {
+ /*
+ * Someone has called callout_drain to kill this
+ * callout. Don't reschedule.
+ */
+ CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+ cancelled ? "cancelled" : "failed to cancel",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ }
+ if (c->c_flags & CALLOUT_PENDING) {
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ cancelled = 1;
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+ }
+
+#ifdef SMP
+ /*
+ * If the callout must migrate try to perform it immediately.
+ * If the callout is currently running, just defer the migration
+ * to a more appropriate moment.
+ */
+ if (c->c_cpu != cpu) {
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
+ cc->cc_exec_entity[direct].ce_migration_time
+ = to_sbt;
+ cc->cc_exec_entity[direct].ce_migration_prec
+ = precision;
+ cc->cc_exec_entity[direct].ce_migration_func = ftn;
+ cc->cc_exec_entity[direct].ce_migration_arg = arg;
+ c->c_flags |= CALLOUT_DFRMIGRATION;
+ CTR6(KTR_CALLOUT,
+ "migration of %p func %p arg %p in %d.%08x to %u deferred",
+ c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff), cpu);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ cc = callout_cpu_switch(c, cc, cpu);
}
+#endif
- /* get callback started, if any */
- return (callout_restart_async(c, &coa, NULL, NULL));
+ callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
+ CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
+ cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff));
+ CC_UNLOCK(cc);
+
+ return (cancelled);
}
/*
@@ -1176,105 +1090,204 @@ callout_schedule(struct callout *c, int to_ticks)
}
int
-callout_stop(struct callout *c)
+_callout_stop_safe(struct callout *c, int safe)
{
- /* get callback stopped, if any */
- return (callout_restart_async(c, NULL, NULL, NULL));
-}
-
-static void
-callout_drain_function(void *arg)
-{
- wakeup(arg);
-}
-
-int
-callout_drain_async(struct callout *c, callout_func_t *fn, void *arg)
-{
- /* get callback stopped, if any */
- return (callout_restart_async(c, NULL, fn, arg) & 2);
-}
-
-int
-callout_drain(struct callout *c)
-{
- int cancelled;
-
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "Draining callout");
-
- callout_lock_client(c->c_flags, c->c_lock);
-
- /* at this point the "c->c_cpu" field is not changing */
+ struct callout_cpu *cc, *old_cc;
+ struct lock_class *class;
+ int direct, sq_locked, use_lock;
- cancelled = callout_drain_async(c, &callout_drain_function, c);
-
- if (cancelled != 0) {
- struct callout_cpu *cc;
- int direct;
+ /*
+ * Some old subsystems don't hold Giant while running a callout_stop(),
+ * so just discard this check for the moment.
+ */
+ if (!safe && c->c_lock != NULL) {
+ if (c->c_lock == &Giant.lock_object)
+ use_lock = mtx_owned(&Giant);
+ else {
+ use_lock = 1;
+ class = LOCK_CLASS(c->c_lock);
+ class->lc_assert(c->c_lock, LA_XLOCKED);
+ }
+ } else
+ use_lock = 0;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ sq_locked = 0;
+ old_cc = NULL;
+again:
+ cc = callout_lock(c);
- CTR3(KTR_CALLOUT, "need to drain %p func %p arg %p",
- c, c->c_func, c->c_arg);
+ /*
+ * If the callout was migrating while the callout cpu lock was
+ * dropped, just drop the sleepqueue lock and check the states
+ * again.
+ */
+ if (sq_locked != 0 && cc != old_cc) {
+#ifdef SMP
+ CC_UNLOCK(cc);
+ sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 0;
+ old_cc = NULL;
+ goto again;
+#else
+ panic("migration should not happen");
+#endif
+ }
- cc = callout_lock(c);
- direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
+ /*
+ * If the callout isn't pending, it's not on the queue, so
+ * don't attempt to remove it from the queue. We can try to
+ * stop it by other means however.
+ */
+ if (!(c->c_flags & CALLOUT_PENDING)) {
+ c->c_flags &= ~CALLOUT_ACTIVE;
/*
- * We've gotten our callout CPU lock, it is safe to
- * drop the initial lock:
+ * If it wasn't on the queue and it isn't the current
+ * callout, then we can't stop it, so just bail.
*/
- callout_unlock_client(c->c_flags, c->c_lock);
-
- /* Wait for drain to complete */
+ if (cc->cc_exec_entity[direct].cc_curr != c) {
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ if (sq_locked)
+ sleepq_release(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ return (0);
+ }
- while (cc->cc_exec_entity[direct].cc_curr == c)
- msleep_spin(c, (struct mtx *)&cc->cc_lock, "codrain", 0);
+ if (safe) {
+ /*
+ * The current callout is running (or just
+ * about to run) and blocking is allowed, so
+ * just wait for the current invocation to
+ * finish.
+ */
+ while (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * Use direct calls to sleepqueue interface
+ * instead of cv/msleep in order to avoid
+ * a LOR between cc_lock and sleepqueue
+ * chain spinlocks. This piece of code
+ * emulates a msleep_spin() call actually.
+ *
+ * If we already have the sleepqueue chain
+ * locked, then we can safely block. If we
+ * don't already have it locked, however,
+ * we have to drop the cc_lock to lock
+ * it. This opens several races, so we
+ * restart at the beginning once we have
+ * both locks. If nothing has changed, then
+ * we will end up back here with sq_locked
+ * set.
+ */
+ if (!sq_locked) {
+ CC_UNLOCK(cc);
+ sleepq_lock(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 1;
+ old_cc = cc;
+ goto again;
+ }
+ /*
+ * Migration could be cancelled here, but
+ * as long as it is still not sure when it
+ * will be packed up, just let softclock()
+ * take care of it.
+ */
+ cc->cc_exec_entity[direct].cc_waiting = true;
+ DROP_GIANT();
+ CC_UNLOCK(cc);
+ sleepq_add(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ &cc->cc_lock.lock_object, "codrain",
+ SLEEPQ_SLEEP, 0);
+ sleepq_wait(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ 0);
+ sq_locked = 0;
+ old_cc = NULL;
+
+ /* Reacquire locks previously released. */
+ PICKUP_GIANT();
+ CC_LOCK(cc);
+ }
+ } else if (use_lock &&
+ !cc->cc_exec_entity[direct].cc_cancel) {
+ /*
+ * The current callout is waiting for its
+ * lock which we hold. Cancel the callout
+ * and return. After our caller drops the
+ * lock, the callout will be skipped in
+ * softclock().
+ */
+ cc->cc_exec_entity[direct].cc_cancel = true;
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ KASSERT(!cc_cce_migrating(cc, direct),
+ ("callout wrongly scheduled for migration"));
+ CC_UNLOCK(cc);
+ KASSERT(!sq_locked, ("sleepqueue chain locked"));
+ return (1);
+ } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (1);
+ }
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
CC_UNLOCK(cc);
- } else {
- callout_unlock_client(c->c_flags, c->c_lock);
+ KASSERT(!sq_locked, ("sleepqueue chain still locked"));
+ return (0);
}
+ if (sq_locked)
+ sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
+
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
c, c->c_func, c->c_arg);
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ callout_cc_del(c, cc);
- return (cancelled & 1);
+ CC_UNLOCK(cc);
+ return (1);
}
void
callout_init(struct callout *c, int mpsafe)
{
+ bzero(c, sizeof *c);
if (mpsafe) {
- _callout_init_lock(c, NULL, CALLOUT_RETURNUNLOCKED);
+ c->c_lock = NULL;
+ c->c_flags = CALLOUT_RETURNUNLOCKED;
} else {
- _callout_init_lock(c, &Giant.lock_object, 0);
+ c->c_lock = &Giant.lock_object;
+ c->c_flags = 0;
}
+ c->c_cpu = timeout_cpu;
}
void
_callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
{
bzero(c, sizeof *c);
- KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0,
- ("callout_init_lock: bad flags 0x%08x", flags));
- flags &= CALLOUT_RETURNUNLOCKED;
- if (lock != NULL) {
- struct lock_class *class = LOCK_CLASS(lock);
- if (class == &lock_class_mtx_sleep)
- flags |= CALLOUT_SET_LC(CALLOUT_LC_MUTEX);
- else if (class == &lock_class_mtx_spin)
- flags |= CALLOUT_SET_LC(CALLOUT_LC_SPIN);
- else if (class == &lock_class_rm)
- flags |= CALLOUT_SET_LC(CALLOUT_LC_RM);
- else if (class == &lock_class_rw)
- flags |= CALLOUT_SET_LC(CALLOUT_LC_RW);
- else
- panic("callout_init_lock: Unsupported lock class '%s'\n", class->lc_name);
- } else {
- flags |= CALLOUT_SET_LC(CALLOUT_LC_UNUSED_0);
- }
c->c_lock = lock;
- c->c_flags = flags;
+ KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
+ ("callout_init_lock: bad flags %d", flags));
+ KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
+ ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
+ KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
+ (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
+ __func__));
+ c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
c->c_cpu = timeout_cpu;
}
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
index 18dc2a05..bbbec920 100644
--- a/sys/kern/subr_sleepqueue.c
+++ b/sys/kern/subr_sleepqueue.c
@@ -152,8 +152,7 @@ static uma_zone_t sleepq_zone;
*/
static int sleepq_catch_signals(void *wchan, int pri);
static int sleepq_check_signals(void);
-static int sleepq_check_timeout(struct thread *);
-static void sleepq_stop_timeout(struct thread *);
+static int sleepq_check_timeout(void);
#ifdef INVARIANTS
static void sleepq_dtor(void *mem, int size, void *arg);
#endif
@@ -374,14 +373,17 @@ void
sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
int flags)
{
+ struct sleepqueue_chain *sc;
struct thread *td;
td = curthread;
-
- mtx_lock_spin(&td->td_slpmutex);
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(TD_ON_SLEEPQ(td));
+ MPASS(td->td_sleepqueue == NULL);
+ MPASS(wchan != NULL);
callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
- mtx_unlock_spin(&td->td_slpmutex);
}
/*
@@ -557,8 +559,11 @@ sleepq_switch(void *wchan, int pri)
* Check to see if we timed out.
*/
static int
-sleepq_check_timeout(struct thread *td)
+sleepq_check_timeout(void)
{
+ struct thread *td;
+
+ td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
@@ -568,18 +573,25 @@ sleepq_check_timeout(struct thread *td)
td->td_flags &= ~TDF_TIMEOUT;
return (EWOULDBLOCK);
}
- return (0);
-}
-/*
- * Atomically stop the timeout by using a mutex.
- */
-static void
-sleepq_stop_timeout(struct thread *td)
-{
- mtx_lock_spin(&td->td_slpmutex);
- callout_stop(&td->td_slpcallout);
- mtx_unlock_spin(&td->td_slpmutex);
+ /*
+ * If TDF_TIMOFAIL is set, the timeout ran after we had
+ * already been woken up.
+ */
+ if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+
+ /*
+ * If callout_stop() fails, then the timeout is running on
+ * another CPU, so synchronize with it to avoid having it
+ * accidentally wake up a subsequent sleep.
+ */
+ else if (callout_stop(&td->td_slpcallout) == 0) {
+ td->td_flags |= TDF_TIMEOUT;
+ TD_SET_SLEEPING(td);
+ mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
+ }
+ return (0);
}
/*
@@ -652,11 +664,9 @@ sleepq_timedwait(void *wchan, int pri)
MPASS(!(td->td_flags & TDF_SINTR));
thread_lock(td);
sleepq_switch(wchan, pri);
- rval = sleepq_check_timeout(td);
+ rval = sleepq_check_timeout();
thread_unlock(td);
- sleepq_stop_timeout(td);
-
return (rval);
}
@@ -667,18 +677,12 @@ sleepq_timedwait(void *wchan, int pri)
int
sleepq_timedwait_sig(void *wchan, int pri)
{
- struct thread *td;
int rcatch, rvalt, rvals;
- td = curthread;
-
rcatch = sleepq_catch_signals(wchan, pri);
- rvalt = sleepq_check_timeout(td);
+ rvalt = sleepq_check_timeout();
rvals = sleepq_check_signals();
- thread_unlock(td);
-
- sleepq_stop_timeout(td);
-
+ thread_unlock(curthread);
if (rcatch)
return (rcatch);
if (rvals)
@@ -885,49 +889,64 @@ sleepq_broadcast(void *wchan, int flags, int pri, int queue)
static void
sleepq_timeout(void *arg)
{
- struct thread *td = arg;
- int wakeup_swapper = 0;
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+ void *wchan;
+ int wakeup_swapper;
+ td = arg;
+ wakeup_swapper = 0;
CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
- /* Handle the three cases which can happen */
-
+ /*
+ * First, see if the thread is asleep and get the wait channel if
+ * it is.
+ */
thread_lock(td);
- if (TD_ON_SLEEPQ(td)) {
- if (TD_IS_SLEEPING(td)) {
- struct sleepqueue_chain *sc;
- struct sleepqueue *sq;
- void *wchan;
+ if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+ wchan = td->td_wchan;
+ sc = SC_LOOKUP(wchan);
+ THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+ td->td_flags |= TDF_TIMEOUT;
+ wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+ return;
+ }
- /*
- * Case I - thread is asleep and needs to be
- * awoken:
- */
- wchan = td->td_wchan;
- sc = SC_LOOKUP(wchan);
- THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
- sq = sleepq_lookup(wchan);
- MPASS(sq != NULL);
- td->td_flags |= TDF_TIMEOUT;
- wakeup_swapper = sleepq_resume_thread(sq, td, 0);
- } else {
- /*
- * Case II - cancel going to sleep by setting
- * the timeout flag because the target thread
- * is not asleep yet. It can be on another CPU
- * in between sleepq_add() and one of the
- * sleepq_*wait*() routines or it can be in
- * sleepq_catch_signals().
- */
- td->td_flags |= TDF_TIMEOUT;
- }
- } else {
- /*
- * Case III - thread is already woken up by a wakeup
- * call and should not timeout. Nothing to do!
- */
+ /*
+ * If the thread is on the SLEEPQ but isn't sleeping yet, it
+ * can either be on another CPU in between sleepq_add() and
+ * one of the sleepq_*wait*() routines or it can be in
+ * sleepq_catch_signals().
+ */
+ if (TD_ON_SLEEPQ(td)) {
+ td->td_flags |= TDF_TIMEOUT;
+ thread_unlock(td);
+ return;
}
+
+ /*
+ * Now check for the edge cases. First, if TDF_TIMEOUT is set,
+ * then the other thread has already yielded to us, so clear
+ * the flag and resume it. If TDF_TIMEOUT is not set, then the
+ * we know that the other thread is not on a sleep queue, but it
+ * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL
+ * to let it know that the timeout has already run and doesn't
+ * need to be canceled.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_IS_SLEEPING(td));
+ td->td_flags &= ~TDF_TIMEOUT;
+ TD_CLR_SLEEPING(td);
+ wakeup_swapper = setrunnable(td);
+ } else
+ td->td_flags |= TDF_TIMOFAIL;
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();