Diffstat (limited to 'sys')
-rw-r--r--  sys/ddb/db_ps.c                              |   9
-rw-r--r--  sys/dev/mlx5/mlx5_en/en.h                    |   4
-rw-r--r--  sys/dev/usb/controller/xhci.c                |  18
-rw-r--r--  sys/kern/kern_clocksource.c                  |   8
-rw-r--r--  sys/kern/kern_event.c                        |   2
-rw-r--r--  sys/kern/kern_fork.c                         |   1
-rw-r--r--  sys/kern/kern_kthread.c                      |   1
-rw-r--r--  sys/kern/kern_thr.c                          |   1
-rw-r--r--  sys/kern/kern_thread.c                       |   4
-rw-r--r--  sys/kern/kern_timeout.c                      | 101
-rw-r--r--  sys/kern/subr_sleepqueue.c                   | 111
-rw-r--r--  sys/kern/sys_generic.c                       |   4
-rw-r--r--  sys/netinet/tcp_lro.c                        |  36
-rw-r--r--  sys/ofed/drivers/infiniband/core/ucma.c      |  16
-rw-r--r--  sys/sys/callout.h                            |   3
-rw-r--r--  sys/sys/param.h                              |   2
-rw-r--r--  sys/sys/proc.h                               |   3
-rw-r--r--  sys/sys/time.h                               |   2
-rw-r--r--  sys/vm/vm_pageout.c                          |  25
19 files changed, 208 insertions, 143 deletions
diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c
index efbc8dc..e030f47 100644
--- a/sys/ddb/db_ps.c
+++ b/sys/ddb/db_ps.c
@@ -371,8 +371,13 @@ DB_SHOW_COMMAND(thread, db_show_thread)
db_printf(" lock: %s turnstile: %p\n", td->td_lockname,
td->td_blocked);
if (TD_ON_SLEEPQ(td))
- db_printf(" wmesg: %s wchan: %p\n", td->td_wmesg,
- td->td_wchan);
+ db_printf(
+ " wmesg: %s wchan: %p sleeptimo %lx. %jx (curr %lx. %jx)\n",
+ td->td_wmesg, td->td_wchan,
+ (long)sbttobt(td->td_sleeptimo).sec,
+ (uintmax_t)sbttobt(td->td_sleeptimo).frac,
+ (long)sbttobt(sbinuptime()).sec,
+ (uintmax_t)sbttobt(sbinuptime()).frac);
db_printf(" priority: %d\n", td->td_priority);
db_printf(" container lock: %s (%p)\n", lock->lo_name, lock);
}
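
The new db_show_thread output above prints the thread's sleep deadline by splitting the 64-bit sbintime_t into bintime seconds and fraction. Below is a minimal userland sketch of that 32.32 fixed-point split, assuming the standard layout used by sbttobt() in sys/sys/time.h; the demo type and helper names are illustrative, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's sbintime_t: seconds in the
 * upper 32 bits, binary fraction of a second in the lower 32 bits. */
typedef int64_t sbintime_demo_t;

struct bintime_demo {
	long		sec;
	uint64_t	frac;
};

/* Mirrors what sbttobt() does: shift the seconds down and promote the
 * 32-bit fraction into the upper half of the 64-bit bintime fraction. */
static struct bintime_demo
sbt_to_bt_demo(sbintime_demo_t sbt)
{
	struct bintime_demo bt;

	bt.sec = (long)(sbt >> 32);
	bt.frac = (uint64_t)sbt << 32;
	return (bt);
}

int
main(void)
{
	sbintime_demo_t sbt = ((int64_t)5 << 32) | 0x80000000; /* 5.5 s */
	struct bintime_demo bt = sbt_to_bt_demo(sbt);

	/* Same "%lx. %jx" formatting as the db_printf() above. */
	printf("sleeptimo %lx. %jx\n", bt.sec, (uintmax_t)bt.frac);
	return (0);
}
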
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index c4415c1..2573f15 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -76,11 +76,11 @@
#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x7
#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa
-#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xd
+#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xe
#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE 0x7
#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa
-#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xd
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe
/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ MJUM16BYTES
diff --git a/sys/dev/usb/controller/xhci.c b/sys/dev/usb/controller/xhci.c
index 0b708b8..2376cee2 100644
--- a/sys/dev/usb/controller/xhci.c
+++ b/sys/dev/usb/controller/xhci.c
@@ -2227,7 +2227,11 @@ xhci_setup_generic_chain(struct usb_xfer *xfer)
* Send a DATA1 message and invert the current
* endpoint direction.
*/
+#ifdef XHCI_STEP_STATUS_STAGE
temp.step_td = (xfer->nframes != 0);
+#else
+ temp.step_td = 0;
+#endif
temp.direction = UE_GET_DIR(xfer->endpointno) ^ UE_DIR_IN;
temp.len = 0;
temp.pc = NULL;
@@ -3873,12 +3877,10 @@ xhci_configure_reset_endpoint(struct usb_xfer *xfer)
xhci_configure_mask(udev, (1U << epno) | 1U, 0);
- err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
-
- if (err != 0)
- DPRINTF("Could not configure endpoint %u\n", epno);
-
- err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index);
+ if (epno > 1)
+ err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index);
+ else
+ err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
if (err != 0)
DPRINTF("Could not configure endpoint %u\n", epno);
@@ -4261,6 +4263,10 @@ xhci_device_state_change(struct usb_device *udev)
sc->sc_hw.devs[index].state = XHCI_ST_ADDRESSED;
+ /* set configure mask to slot only */
+ xhci_configure_mask(udev, 1, 0);
+
+ /* deconfigure all endpoints, except EP0 */
err = xhci_cmd_configure_ep(sc, 0, 1, index);
if (err) {
diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c
index 49ac7bf..fc4ca83a 100644
--- a/sys/kern/kern_clocksource.c
+++ b/sys/kern/kern_clocksource.c
@@ -211,7 +211,7 @@ handleevents(sbintime_t now, int fake)
} else
state->nextprof = state->nextstat;
if (now >= state->nextcallopt) {
- state->nextcall = state->nextcallopt = INT64_MAX;
+ state->nextcall = state->nextcallopt = SBT_MAX;
callout_process(now);
}
@@ -492,7 +492,7 @@ configtimer(int start)
state = DPCPU_ID_PTR(cpu, timerstate);
state->now = now;
if (!smp_started && cpu != CPU_FIRST())
- state->nextevent = INT64_MAX;
+ state->nextevent = SBT_MAX;
else
state->nextevent = next;
if (periodic)
@@ -580,8 +580,8 @@ cpu_initclocks_bsp(void)
CPU_FOREACH(cpu) {
state = DPCPU_ID_PTR(cpu, timerstate);
mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
- state->nextcall = INT64_MAX;
- state->nextcallopt = INT64_MAX;
+ state->nextcall = SBT_MAX;
+ state->nextcallopt = SBT_MAX;
}
periodic = want_periodic;
/* Grab requested timer or the best of present. */
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index e4bf40b..fba163f 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1471,7 +1471,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
rsbt = tstosbt(*tsp);
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = 0;
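
The INT64_MAX to SBT_MAX substitutions in this and the surrounding files all guard the same pattern: before adding a relative timeout to the current uptime, check the remaining headroom so the signed 64-bit addition cannot overflow. A small self-contained sketch of the idea follows; the function name and the saturating fallback value are illustrative only (the individual callers above substitute their own sentinel on overflow).

#include <stdint.h>
#include <stdio.h>

#define SBT_MAX_DEMO	0x7fffffffffffffffLL	/* mirrors SBT_MAX */

/*
 * Add a relative timeout 'rel' to an absolute time 'now' without risking
 * signed overflow: if there is not enough headroom left below SBT_MAX,
 * saturate instead of wrapping (signed overflow is undefined behaviour).
 */
static int64_t
sbt_add_sat(int64_t now, int64_t rel)
{
	if (now <= SBT_MAX_DEMO - rel)
		return (now + rel);
	return (SBT_MAX_DEMO);	/* effectively "never" */
}

int
main(void)
{
	printf("%jd\n", (intmax_t)sbt_add_sat(1000, 500));
	printf("%jd\n", (intmax_t)sbt_add_sat(SBT_MAX_DEMO - 10, 500));
	return (0);
}
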
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 8fa6bcd..7877fab 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -470,6 +470,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
td2->td_su = NULL;
+ td2->td_sleeptimo = 0;
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
index fb46025..4c77f4f 100644
--- a/sys/kern/kern_kthread.c
+++ b/sys/kern/kern_kthread.c
@@ -272,6 +272,7 @@ kthread_add(void (*func)(void *), void *arg, struct proc *p,
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
newtd->td_su = NULL;
+ newtd->td_sleeptimo = 0;
bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index b01aecb..74050ff 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -229,6 +229,7 @@ thread_create(struct thread *td, struct rtprio *rtp,
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
newtd->td_su = NULL;
+ newtd->td_sleeptimo = 0;
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 2f8382c..7b04d96 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -319,7 +319,7 @@ thread_reap(void)
/*
* Don't even bother to lock if none at this instant,
- * we really don't care about the next instant..
+ * we really don't care about the next instant.
*/
if (!TAILQ_EMPTY(&zombie_threads)) {
mtx_lock_spin(&zombie_lock);
@@ -383,6 +383,7 @@ thread_free(struct thread *td)
cpu_thread_free(td);
if (td->td_kstack != 0)
vm_thread_dispose(td);
+ callout_drain(&td->td_slpcallout);
uma_zfree(thread_zone, td);
}
@@ -524,6 +525,7 @@ thread_wait(struct proc *p)
td->td_cpuset = NULL;
cpu_thread_clean(td);
crfree(td->td_ucred);
+ callout_drain(&td->td_slpcallout);
thread_reap(); /* check for zombie threads etc. */
}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 9aa11ba..7e5aab7 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -296,7 +296,7 @@ callout_cpu_init(struct callout_cpu *cc, int cpu)
for (i = 0; i < callwheelsize; i++)
LIST_INIT(&cc->cc_callwheel[i]);
TAILQ_INIT(&cc->cc_expireq);
- cc->cc_firstevent = INT64_MAX;
+ cc->cc_firstevent = SBT_MAX;
for (i = 0; i < 2; i++)
cc_cce_cleanup(cc, i);
snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
@@ -569,8 +569,8 @@ callout_cc_add(struct callout *c, struct callout_cpu *cc,
* Inform the eventtimers(4) subsystem there's a new callout
* that has been inserted, but only if really required.
*/
- if (INT64_MAX - c->c_time < c->c_precision)
- c->c_precision = INT64_MAX - c->c_time;
+ if (SBT_MAX - c->c_time < c->c_precision)
+ c->c_precision = SBT_MAX - c->c_time;
sbt = c->c_time + c->c_precision;
if (sbt < cc->cc_firstevent) {
cc->cc_firstevent = sbt;
@@ -896,6 +896,56 @@ callout_handle_init(struct callout_handle *handle)
handle->callout = NULL;
}
+void
+callout_when(sbintime_t sbt, sbintime_t precision, int flags,
+ sbintime_t *res, sbintime_t *prec_res)
+{
+ sbintime_t to_sbt, to_pr;
+
+ if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) {
+ *res = sbt;
+ *prec_res = precision;
+ return;
+ }
+ if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt)
+ sbt = tick_sbt;
+ if ((flags & C_HARDCLOCK) != 0 ||
+#ifdef NO_EVENTTIMERS
+ sbt >= sbt_timethreshold) {
+ to_sbt = getsbinuptime();
+
+ /* Add safety belt for the case of hz > 1000. */
+ to_sbt += tc_tick_sbt - tick_sbt;
+#else
+ sbt >= sbt_tickthreshold) {
+ /*
+ * Obtain the time of the last hardclock() call on
+ * this CPU directly from the kern_clocksource.c.
+ * This value is per-CPU, but it is equal for all
+ * active ones.
+ */
+#ifdef __LP64__
+ to_sbt = DPCPU_GET(hardclocktime);
+#else
+ spinlock_enter();
+ to_sbt = DPCPU_GET(hardclocktime);
+ spinlock_exit();
+#endif
+#endif
+ if ((flags & C_HARDCLOCK) == 0)
+ to_sbt += tick_sbt;
+ } else
+ to_sbt = sbinuptime();
+ if (SBT_MAX - to_sbt < sbt)
+ to_sbt = SBT_MAX;
+ else
+ to_sbt += sbt;
+ *res = to_sbt;
+ to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+ sbt >> C_PRELGET(flags));
+ *prec_res = to_pr > precision ? to_pr : precision;
+}
+
/*
* New interface; clients allocate their own callout structures.
*
@@ -913,10 +963,10 @@ callout_handle_init(struct callout_handle *handle)
* callout_deactivate() - marks the callout as having been serviced
*/
int
-callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec,
void (*ftn)(void *), void *arg, int cpu, int flags)
{
- sbintime_t to_sbt, pr;
+ sbintime_t to_sbt, precision;
struct callout_cpu *cc;
int cancelled, direct;
int ignore_cpu=0;
@@ -929,47 +979,8 @@ callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
/* Invalid CPU spec */
panic("Invalid CPU in callout %d", cpu);
}
- if (flags & C_ABSOLUTE) {
- to_sbt = sbt;
- } else {
- if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
- sbt = tick_sbt;
- if ((flags & C_HARDCLOCK) ||
-#ifdef NO_EVENTTIMERS
- sbt >= sbt_timethreshold) {
- to_sbt = getsbinuptime();
+ callout_when(sbt, prec, flags, &to_sbt, &precision);
- /* Add safety belt for the case of hz > 1000. */
- to_sbt += tc_tick_sbt - tick_sbt;
-#else
- sbt >= sbt_tickthreshold) {
- /*
- * Obtain the time of the last hardclock() call on
- * this CPU directly from the kern_clocksource.c.
- * This value is per-CPU, but it is equal for all
- * active ones.
- */
-#ifdef __LP64__
- to_sbt = DPCPU_GET(hardclocktime);
-#else
- spinlock_enter();
- to_sbt = DPCPU_GET(hardclocktime);
- spinlock_exit();
-#endif
-#endif
- if ((flags & C_HARDCLOCK) == 0)
- to_sbt += tick_sbt;
- } else
- to_sbt = sbinuptime();
- if (INT64_MAX - to_sbt < sbt)
- to_sbt = INT64_MAX;
- else
- to_sbt += sbt;
- pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
- sbt >> C_PRELGET(flags));
- if (pr > precision)
- precision = pr;
- }
/*
* This flag used to be added by callout_cc_add, but the
* first time you call this we could end up with the
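
The new callout_when() factors the absolute-deadline and precision computation out of callout_reset_sbt_on(), so a caller (like the sleepqueue code below) can precompute both values, remember the deadline, and pass C_PRECALC to have the callout code reuse them unchanged. A rough userland sketch of the precision side of that computation, i.e. how the requested precision is widened to at least a power-of-two fraction of the relative timeout; the function name and the hard-coded tc_precexp stand-in are illustrative.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the precision selection at the end of callout_when(): the
 * effective precision is the larger of the caller-supplied precision and
 * a power-of-two fraction of the relative timeout (the shift comes from
 * the C_PREL() flag bits, falling back to the global tc_precexp).
 */
static int64_t
effective_precision(int64_t rel_sbt, int64_t req_prec, int prel, int precexp)
{
	int64_t to_pr;

	to_pr = (prel < 0) ? rel_sbt >> precexp : rel_sbt >> prel;
	return (to_pr > req_prec ? to_pr : req_prec);
}

int
main(void)
{
	/* 1 second relative timeout, tiny requested precision, no C_PREL(). */
	int64_t one_sec = (int64_t)1 << 32;

	printf("%jd\n", (intmax_t)effective_precision(one_sec, 0, -1, 5));
	return (0);
}
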
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
index c490460..4941b47 100644
--- a/sys/kern/subr_sleepqueue.c
+++ b/sys/kern/subr_sleepqueue.c
@@ -361,6 +361,7 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
{
struct sleepqueue_chain *sc;
struct thread *td;
+ sbintime_t pr1;
td = curthread;
sc = SC_LOOKUP(wchan);
@@ -368,8 +369,14 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_sleepqueue == NULL);
MPASS(wchan != NULL);
- callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
- sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
+ KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx",
+ td->td_tid, td, (uintmax_t)td->td_sleeptimo));
+ thread_lock(td);
+ callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1);
+ thread_unlock(td);
+ callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1,
+ sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC |
+ C_DIRECT_EXEC);
}
/*
@@ -548,37 +555,36 @@ static int
sleepq_check_timeout(void)
{
struct thread *td;
+ int res;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
- * If TDF_TIMEOUT is set, we timed out.
+ * If TDF_TIMEOUT is set, we timed out. But recheck
+ * td_sleeptimo anyway.
*/
- if (td->td_flags & TDF_TIMEOUT) {
- td->td_flags &= ~TDF_TIMEOUT;
- return (EWOULDBLOCK);
+ res = 0;
+ if (td->td_sleeptimo != 0) {
+ if (td->td_sleeptimo <= sbinuptime())
+ res = EWOULDBLOCK;
+ td->td_sleeptimo = 0;
}
-
- /*
- * If TDF_TIMOFAIL is set, the timeout ran after we had
- * already been woken up.
- */
- if (td->td_flags & TDF_TIMOFAIL)
- td->td_flags &= ~TDF_TIMOFAIL;
-
- /*
- * If callout_stop() fails, then the timeout is running on
- * another CPU, so synchronize with it to avoid having it
- * accidentally wake up a subsequent sleep.
- */
- else if (_callout_stop_safe(&td->td_slpcallout, CS_MIGRBLOCK)
- == 0) {
- td->td_flags |= TDF_TIMEOUT;
- TD_SET_SLEEPING(td);
- mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
- }
- return (0);
+ if (td->td_flags & TDF_TIMEOUT)
+ td->td_flags &= ~TDF_TIMEOUT;
+ else
+ /*
+ * We ignore the situation where timeout subsystem was
+ * unable to stop our callout. The struct thread is
+ * type-stable, the callout will use the correct
+ * memory when running. The checks of the
+ * td_sleeptimo value in this function and in
+ * sleepq_timeout() ensure that the thread does not
+ * get spurious wakeups, even if the callout was reset
+ * or thread reused.
+ */
+ callout_stop(&td->td_slpcallout);
+ return (res);
}
/*
@@ -887,12 +893,17 @@ sleepq_timeout(void *arg)
CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
- /*
- * First, see if the thread is asleep and get the wait channel if
- * it is.
- */
thread_lock(td);
- if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+
+ if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) {
+ /*
+ * The thread does not want a timeout (yet).
+ */
+ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+ /*
+ * See if the thread is asleep and get the wait
+ * channel if it is.
+ */
wchan = td->td_wchan;
sc = SC_LOOKUP(wchan);
THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
@@ -900,40 +911,16 @@ sleepq_timeout(void *arg)
MPASS(sq != NULL);
td->td_flags |= TDF_TIMEOUT;
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
- thread_unlock(td);
- if (wakeup_swapper)
- kick_proc0();
- return;
- }
-
- /*
- * If the thread is on the SLEEPQ but isn't sleeping yet, it
- * can either be on another CPU in between sleepq_add() and
- * one of the sleepq_*wait*() routines or it can be in
- * sleepq_catch_signals().
- */
- if (TD_ON_SLEEPQ(td)) {
+ } else if (TD_ON_SLEEPQ(td)) {
+ /*
+ * If the thread is on the SLEEPQ but isn't sleeping
+ * yet, it can either be on another CPU in between
+ * sleepq_add() and one of the sleepq_*wait*()
+ * routines or it can be in sleepq_catch_signals().
+ */
td->td_flags |= TDF_TIMEOUT;
- thread_unlock(td);
- return;
}
- /*
- * Now check for the edge cases. First, if TDF_TIMEOUT is set,
- * then the other thread has already yielded to us, so clear
- * the flag and resume it. If TDF_TIMEOUT is not set, then the
- * we know that the other thread is not on a sleep queue, but it
- * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL
- * to let it know that the timeout has already run and doesn't
- * need to be canceled.
- */
- if (td->td_flags & TDF_TIMEOUT) {
- MPASS(TD_IS_SLEEPING(td));
- td->td_flags &= ~TDF_TIMEOUT;
- TD_CLR_SLEEPING(td);
- wakeup_swapper = setrunnable(td);
- } else
- td->td_flags |= TDF_TIMOFAIL;
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
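
The subr_sleepqueue.c change above replaces the TDF_TIMOFAIL handshake with a simpler invariant: the absolute deadline computed by callout_when() is stored in td_sleeptimo before the callout is armed, the callout only acts if that deadline has actually passed, and sleepq_check_timeout() reports EWOULDBLOCK on the same condition and then clears the field, so a stale or rescheduled callout cannot produce a spurious wakeup or timeout. A toy single-threaded sketch of that idea follows; the names and types are illustrative, and the real code relies on the thread lock for ordering.

#include <stdint.h>
#include <stdio.h>

#define DEMO_EWOULDBLOCK	35

struct demo_thread {
	int64_t	sleeptimo;	/* absolute deadline, 0 = no timeout armed */
};

/* Arm a sleep timeout: remember the absolute deadline before "sleeping". */
static void
demo_set_timeout(struct demo_thread *td, int64_t deadline)
{
	td->sleeptimo = deadline;
}

/*
 * On wakeup, report a timeout only if a deadline was armed and has
 * passed; always clear it, so a late-firing callout that looks at the
 * field afterwards sees 0 and does nothing.
 */
static int
demo_check_timeout(struct demo_thread *td, int64_t now)
{
	int res = 0;

	if (td->sleeptimo != 0) {
		if (td->sleeptimo <= now)
			res = DEMO_EWOULDBLOCK;
		td->sleeptimo = 0;
	}
	return (res);
}

int
main(void)
{
	struct demo_thread td = { 0 };

	demo_set_timeout(&td, 100);
	printf("woken early: %d\n", demo_check_timeout(&td, 50));	/* 0 */
	demo_set_timeout(&td, 100);
	printf("woken late:  %d\n", demo_check_timeout(&td, 150));	/* 35 */
	return (0);
}
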
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 033bbf0..96ec1d1 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -1089,7 +1089,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
precision >>= tc_precexp;
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = -1;
@@ -1626,7 +1626,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
precision >>= tc_precexp;
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = -1;
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index a290aae..761d8f3 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -382,6 +382,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
tcp_seq seq;
int error, ip_len, l;
uint16_t eh_type, tcp_data_len;
+ int force_flush = 0;
/* We expect a contiguous header [eh, ip, tcp]. */
@@ -448,8 +449,15 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* Check TCP header constraints.
*/
/* Ensure no bits set besides ACK or PSH. */
- if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
- return (TCP_LRO_CANNOT);
+ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
+ if (th->th_flags & TH_SYN)
+ return (TCP_LRO_CANNOT);
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment, e.g. FIN.
+ */
+ force_flush = 1;
+ }
/* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
/* XXX-BZ Ideally we'd flush on PUSH? */
@@ -465,8 +473,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
ts_ptr = (uint32_t *)(th + 1);
if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
(*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
- TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
- return (TCP_LRO_CANNOT);
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment.
+ */
+ force_flush = 1;
+ }
/* If the driver did not pass in the checksum, set it now. */
if (csum == 0x0000)
@@ -500,6 +513,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
#endif
}
+ if (force_flush) {
+ /* Timestamps mismatch; this is a FIN, etc */
+ SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_flush(lc, le);
+ return (TCP_LRO_CANNOT);
+ }
+
/* Flush now if appending will result in overflow. */
if (le->p_len > (65535 - tcp_data_len)) {
SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
@@ -568,6 +588,14 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
return (0);
}
+ if (force_flush) {
+ /*
+ * Nothing to flush, but this segment can not be further
+ * aggregated/delayed.
+ */
+ return (TCP_LRO_CANNOT);
+ }
+
/* Try to find an empty slot. */
if (SLIST_EMPTY(&lc->lro_free))
return (TCP_LRO_NO_ENTRIES);
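
The force_flush changes above make tcp_lro_rx() treat segments with unexpected header bits (e.g. FIN) or unsupported TCP options as barriers: anything already aggregated is flushed first and the segment itself is handed up unmerged, while SYN segments are still rejected outright. A small standalone sketch of that three-way decision on the flag bits; the flag values mirror <netinet/tcp.h>, and the enum and function names are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Standard TCP flag bits, as in <netinet/tcp.h>. */
#define TH_FIN	0x01
#define TH_SYN	0x02
#define TH_RST	0x04
#define TH_PUSH	0x08
#define TH_ACK	0x10

enum lro_decision {
	LRO_AGGREGATE,		/* plain ACK/PSH data, can be merged */
	LRO_FORCE_FLUSH,	/* deliver queued data first, then this one */
	LRO_REJECT		/* cannot handle at all (e.g. SYN) */
};

static enum lro_decision
lro_classify_flags(uint8_t th_flags)
{
	if ((th_flags & ~(TH_ACK | TH_PUSH)) == 0)
		return (LRO_AGGREGATE);
	if (th_flags & TH_SYN)
		return (LRO_REJECT);
	/* FIN, RST, URG, ...: flush what we have, pass this segment up. */
	return (LRO_FORCE_FLUSH);
}

int
main(void)
{
	printf("%d %d %d\n",
	    lro_classify_flags(TH_ACK),
	    lro_classify_flags(TH_ACK | TH_FIN),
	    lro_classify_flags(TH_SYN));
	return (0);
}
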
diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c
index 23cbf7b..3c7c751 100644
--- a/sys/ofed/drivers/infiniband/core/ucma.c
+++ b/sys/ofed/drivers/infiniband/core/ucma.c
@@ -39,6 +39,8 @@
#include <linux/in6.h>
#include <linux/miscdevice.h>
+#include <sys/filio.h>
+
#include <rdma/rdma_user_cm.h>
#include <rdma/ib_marshall.h>
#include <rdma/rdma_cm.h>
@@ -1285,11 +1287,25 @@ static int ucma_close(struct inode *inode, struct file *filp)
return 0;
}
+static long
+ucma_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+
+ switch (cmd) {
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (-ENOTTY);
+ }
+}
+
static const struct file_operations ucma_fops = {
.owner = THIS_MODULE,
.open = ucma_open,
.release = ucma_close,
.write = ucma_write,
+ .unlocked_ioctl = ucma_ioctl,
.poll = ucma_poll,
};
diff --git a/sys/sys/callout.h b/sys/sys/callout.h
index d3f2bca..4e86b16 100644
--- a/sys/sys/callout.h
+++ b/sys/sys/callout.h
@@ -57,6 +57,7 @@
#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1)
#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */
#define C_ABSOLUTE 0x0200 /* event time is absolute. */
+#define C_PRECALC 0x0400 /* event time is pre-calculated. */
struct callout_handle {
struct callout *callout;
@@ -129,6 +130,8 @@ int callout_schedule_on(struct callout *, int, int);
int _callout_stop_safe(struct callout *, int);
void callout_process(sbintime_t now);
+void callout_when(sbintime_t sbt, sbintime_t precision, int flags,
+ sbintime_t *sbt_res, sbintime_t *prec_res);
#endif
#endif /* _SYS_CALLOUT_H_ */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 07f69c6..4bd9584 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1003506 /* Master, propagated to newvers */
+#define __FreeBSD_version 1003507 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 59c75c5..6b0c924 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -325,6 +325,7 @@ struct thread {
u_int td_dbg_sc_code; /* (c) Syscall code to debugger. */
u_int td_dbg_sc_narg; /* (c) Syscall arg count to debugger.*/
void *td_emuldata; /* Emulator state data */
+ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */
};
struct mtx *thread_lock_block(struct thread *);
@@ -364,7 +365,7 @@ do { \
#define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */
#define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */
#define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */
-#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */
+#define TDF_UNUSED12 0x00001000 /* --available-- */
#define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */
#define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */
#define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */
diff --git a/sys/sys/time.h b/sys/sys/time.h
index b589527..1f0a530 100644
--- a/sys/sys/time.h
+++ b/sys/sys/time.h
@@ -129,7 +129,7 @@ bintime_shift(struct bintime *_bt, int _exp)
#define SBT_1MS (SBT_1S / 1000)
#define SBT_1US (SBT_1S / 1000000)
#define SBT_1NS (SBT_1S / 1000000000)
-#define SBT_MAX 0x7fffffffffffffff
+#define SBT_MAX 0x7fffffffffffffffLL
static __inline int
sbintime_getsec(sbintime_t _sbt)
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index c9c1271..233bb99 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1399,15 +1399,13 @@ relock_queue:
/*
* Scan the active queue for pages that can be deactivated. Update
* the per-page activity counter and use it to identify deactivation
- * candidates.
+ * candidates. Held pages may be deactivated.
*/
for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
scanned++) {
-
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_scan: page %p isn't active", m));
-
next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
continue;
@@ -1421,8 +1419,8 @@ relock_queue:
}
/*
- * The count for pagedaemon pages is done after checking the
- * page for eligibility...
+ * The count for page daemon pages is updated after checking
+ * the page for eligibility.
*/
PCPU_INC(cnt.v_pdpages);
@@ -1435,12 +1433,17 @@ relock_queue:
act_delta += 1;
}
/*
- * Unlocked object ref count check. Two races are possible.
- * 1) The ref was transitioning to zero and we saw non-zero,
- * the pmap bits will be checked unnecessarily.
- * 2) The ref was transitioning to one and we saw zero.
- * The page lock prevents a new reference to this page so
- * we need not check the reference bits.
+ * Perform an unsynchronized object ref count check. While
+ * the page lock ensures that the page is not reallocated to
+ * another object, in particular, one with unmanaged mappings
+ * that cannot support pmap_ts_referenced(), two races are,
+ * nonetheless, possible:
+ * 1) The count was transitioning to zero, but we saw a non-
+ * zero value. pmap_ts_referenced() will return zero
+ * because the page is not mapped.
+ * 2) The count was transitioning to one, but we saw zero.
+ * This race delays the detection of a new reference. At
+ * worst, we will deactivate and reactivate the page.
*/
if (m->object->ref_count != 0)
act_delta += pmap_ts_referenced(m);