author    Renato Botelho <renato@netgate.com>    2016-08-25 10:41:37 -0300
committer Renato Botelho <renato@netgate.com>    2016-08-25 10:41:37 -0300
commit    29ebd1247162a77db08e5e2e00d033220ec807fe (patch)
tree      d45bd4c2da327a132f18b6f39db36fe188c4e029
parent    75cd8d40056c799f03b759475d9bfd10ba266a6c (diff)
parent    c29dc2b4296960868edafe94ebf975be284200bb (diff)
Merge remote-tracking branch 'origin/stable/10' into devel
-rw-r--r--  contrib/binutils/bfd/elfxx-mips.c | 2
-rw-r--r--  lib/libc/Versions.def | 6
-rw-r--r--  lib/libc/include/libc_private.h | 6
-rw-r--r--  lib/libc/stdio/fgetln.c | 1
-rw-r--r--  lib/libc/stdlib/Makefile.inc | 2
-rw-r--r--  lib/libc/stdlib/Symbol.map | 5
-rw-r--r--  lib/libc/stdlib/cxa_thread_atexit.c | 140
-rw-r--r--  lib/libc/stdlib/exit.c | 6
-rw-r--r--  lib/libc/sys/aio_fsync.2 | 13
-rw-r--r--  lib/libc/sys/aio_mlock.2 | 10
-rw-r--r--  lib/libc/sys/aio_read.2 | 9
-rw-r--r--  lib/libc/sys/aio_write.2 | 9
-rw-r--r--  lib/libc/sys/ptrace.2 | 42
-rw-r--r--  lib/libc/tests/stdlib/Makefile | 11
-rw-r--r--  lib/libc/tests/stdlib/cxa_thread_atexit_nothr_test.cc | 102
-rw-r--r--  lib/libc/tests/stdlib/cxa_thread_atexit_test.cc | 180
-rw-r--r--  lib/libthr/thread/thr_exit.c | 9
-rw-r--r--  libexec/rtld-elf/rtld.c | 20
-rw-r--r--  share/man/man4/Makefile | 8
-rw-r--r--  share/man/man4/if_ntb.4 | 89
-rw-r--r--  share/man/man4/ntb_hw.4 | 117
-rw-r--r--  share/man/man4/ntb_transport.4 | 79
-rw-r--r--  sys/boot/efi/loader/Makefile | 12
-rw-r--r--  sys/cam/cam_ccb.h | 12
-rw-r--r--  sys/cam/ctl/ctl.c | 2
-rw-r--r--  sys/cam/ctl/scsi_ctl.c | 23
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vnode.h | 2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h | 16
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h | 2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c | 43
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c | 449
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c | 2563
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c | 88
-rw-r--r--  sys/conf/files.amd64 | 5
-rw-r--r--  sys/conf/files.i386 | 5
-rw-r--r--  sys/dev/ahci/ahci.c | 4
-rw-r--r--  sys/dev/ahci/ahci_pci.c | 2
-rw-r--r--  sys/dev/e1000/e1000_api.c | 4
-rw-r--r--  sys/dev/e1000/e1000_hw.h | 8
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.c | 17
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.h | 2
-rw-r--r--  sys/dev/e1000/e1000_phy.c | 16
-rw-r--r--  sys/dev/e1000/if_em.c | 6
-rw-r--r--  sys/dev/filemon/filemon.c | 2
-rw-r--r--  sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c | 155
-rw-r--r--  sys/dev/hyperv/storvsc/hv_vstorage.h | 6
-rw-r--r--  sys/dev/isp/isp.c | 5
-rw-r--r--  sys/dev/isp/isp_freebsd.c | 24
-rw-r--r--  sys/dev/ntb/if_ntb/if_ntb.c | 1807
-rw-r--r--  sys/dev/ntb/ntb.c | 463
-rw-r--r--  sys/dev/ntb/ntb.h | 409
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.c | 1357
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.h | 125
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_regs.h | 3
-rw-r--r--  sys/dev/ntb/ntb_if.m | 210
-rw-r--r--  sys/dev/ntb/ntb_transport.c | 1521
-rw-r--r--  sys/dev/ntb/ntb_transport.h | 61
-rw-r--r--  sys/dev/pci/pci.c | 4
-rw-r--r--  sys/dev/sfxge/sfxge_ev.c | 2
-rw-r--r--  sys/dev/sfxge/sfxge_port.c | 4
-rw-r--r--  sys/kern/kern_fork.c | 3
-rw-r--r--  sys/kern/kern_sig.c | 2
-rw-r--r--  sys/kern/subr_syscall.c | 8
-rw-r--r--  sys/kern/sys_process.c | 17
-rw-r--r--  sys/kern/vfs_aio.c | 7
-rw-r--r--  sys/kern/vfs_default.c | 1
-rw-r--r--  sys/modules/ntb/Makefile | 2
-rw-r--r--  sys/modules/ntb/ntb/Makefile | 11
-rw-r--r--  sys/modules/ntb/ntb_hw/Makefile | 4
-rw-r--r--  sys/modules/ntb/ntb_transport/Makefile | 11
-rw-r--r--  sys/net/if_bridge.c | 3
-rw-r--r--  sys/netpfil/pf/pf.c | 12
-rw-r--r--  sys/sys/param.h | 2
-rw-r--r--  sys/sys/proc.h | 5
-rw-r--r--  sys/sys/ptrace.h | 3
-rw-r--r--  sys/ufs/ffs/ffs_balloc.c | 68
-rw-r--r--  tests/sys/kern/ptrace_test.c | 126
-rw-r--r--  usr.bin/xinstall/xinstall.c | 47
-rw-r--r--  usr.sbin/bhyve/Makefile | 9
-rw-r--r--  usr.sbin/bhyve/bhyve.8 | 20
-rw-r--r--  usr.sbin/bhyve/pci_ahci.c | 267
-rw-r--r--  usr.sbin/bhyve/pci_e82545.c | 2372
-rw-r--r--  usr.sbin/bhyve/pci_emul.h | 2
86 files changed, 8256 insertions, 5092 deletions
diff --git a/contrib/binutils/bfd/elfxx-mips.c b/contrib/binutils/bfd/elfxx-mips.c
index 8f4b05d..1a49b0c 100644
--- a/contrib/binutils/bfd/elfxx-mips.c
+++ b/contrib/binutils/bfd/elfxx-mips.c
@@ -4801,7 +4801,7 @@ mips_elf_create_dynamic_relocation (bfd *output_bfd,
/* We must now calculate the dynamic symbol table index to use
in the relocation. */
if (h != NULL
- && (!h->root.def_regular
+ && (sec == NULL || !h->root.def_regular
|| (info->shared && !info->symbolic && !h->root.forced_local)))
{
indx = h->root.dynindx;
diff --git a/lib/libc/Versions.def b/lib/libc/Versions.def
index 8452c7d..e348308 100644
--- a/lib/libc/Versions.def
+++ b/lib/libc/Versions.def
@@ -27,6 +27,10 @@ FBSD_1.3 {
FBSD_1.4 {
} FBSD_1.3;
+# This version was first added to 12.0-current.
+FBSD_1.5 {
+} FBSD_1.4;
+
# This is our private namespace. Any global interfaces that are
# strictly for use only by other FreeBSD applications and libraries
@@ -35,4 +39,4 @@ FBSD_1.4 {
#
# Please do NOT increment the version of this namespace.
FBSDprivate_1.0 {
-} FBSD_1.4;
+} FBSD_1.5;
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
index 6e2ff66..c219f55 100644
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -264,6 +264,12 @@ extern const char *__progname;
void _malloc_thread_cleanup(void);
/*
+ * This function is used by the threading libraries to notify libc that a
+ * thread is exiting, so its thread-local dtors should be called.
+ */
+void __cxa_thread_call_dtors(void);
+
+/*
* These functions are used by the threading libraries in order to protect
* malloc across fork().
*/
diff --git a/lib/libc/stdio/fgetln.c b/lib/libc/stdio/fgetln.c
index 1779de2..6546768 100644
--- a/lib/libc/stdio/fgetln.c
+++ b/lib/libc/stdio/fgetln.c
@@ -159,6 +159,7 @@ fgetln(FILE *fp, size_t *lenp)
error:
*lenp = 0; /* ??? */
+ fp->_flags |= __SERR;
FUNLOCKFILE(fp);
return (NULL); /* ??? */
}
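
The fgetln(3) fix above sets the stream error indicator on internal failure, so a caller can now distinguish end-of-file from an error with ferror(3). A minimal sketch of the resulting calling pattern (the function name is illustrative):

    #include <stdio.h>

    /* Copy a stream line by line with fgetln(); minimal sketch. */
    int
    copy_lines(FILE *fp)
    {
            char *line;
            size_t len;

            while ((line = fgetln(fp, &len)) != NULL) {
                    /* line is not NUL-terminated; len gives its length. */
                    fwrite(line, 1, len, stdout);
            }
            if (ferror(fp))         /* now set by fgetln() on failure */
                    return (-1);
            return (0);             /* plain end-of-file */
    }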
diff --git a/lib/libc/stdlib/Makefile.inc b/lib/libc/stdlib/Makefile.inc
index 75204f5..9bffd7e 100644
--- a/lib/libc/stdlib/Makefile.inc
+++ b/lib/libc/stdlib/Makefile.inc
@@ -5,7 +5,7 @@
.PATH: ${.CURDIR}/${LIBC_ARCH}/stdlib ${.CURDIR}/stdlib
MISRCS+=_Exit.c a64l.c abort.c abs.c atexit.c atof.c atoi.c atol.c atoll.c \
- bsearch.c div.c exit.c getenv.c getopt.c getopt_long.c \
+ bsearch.c cxa_thread_atexit.c div.c exit.c getenv.c getopt.c getopt_long.c \
getsubopt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \
insque.c l64a.c labs.c ldiv.c llabs.c lldiv.c lsearch.c \
merge.c ptsname.c qsort.c qsort_r.c quick_exit.c radixsort.c rand.c \
diff --git a/lib/libc/stdlib/Symbol.map b/lib/libc/stdlib/Symbol.map
index 4b8ef5b..f6b62a9 100644
--- a/lib/libc/stdlib/Symbol.map
+++ b/lib/libc/stdlib/Symbol.map
@@ -104,8 +104,13 @@ FBSD_1.3 {
strtouq_l;
};
+FBSD_1.5 {
+ __cxa_thread_atexit;
+};
+
FBSDprivate_1.0 {
__system;
_system;
__libc_system;
+ __cxa_thread_call_dtors;
};
diff --git a/lib/libc/stdlib/cxa_thread_atexit.c b/lib/libc/stdlib/cxa_thread_atexit.c
new file mode 100644
index 0000000..c966731
--- /dev/null
+++ b/lib/libc/stdlib/cxa_thread_atexit.c
@@ -0,0 +1,140 @@
+/*-
+ * Copyright (c) 2016 Mahdi Mokhtari <mokhi64@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+#include "namespace.h"
+#include <errno.h>
+#include <link.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "un-namespace.h"
+#include "libc_private.h"
+
+/*
+ * C++11 introduces the thread_local scope (like __thread with some
+ * additions). As a key-feature it should support non-trivial
+ * destructors, registered with __cxa_thread_atexit() to be executed
+ * at the thread termination.
+ *
+ * The implementation keeps a _Thread_local list of destructors for each
+ * thread, and calls __cxa_thread_call_dtors() on each thread's exit
+ * to do cleanup. For a thread calling exit(3), in particular for
+ * the initial thread returning from main(), we call
+ * __cxa_thread_call_dtors() inside exit().
+ *
+ * It is possible for a dynamically loaded library that uses a
+ * thread_local variable to be dlclose()'d before the thread exits.
+ * The destructor of that variable would then point into the unloaded
+ * library and calling it would crash. We use __elf_phdr_match_addr()
+ * to detect such cases and skip the destructor call, preventing the
+ * crash.
+ */
+
+#define CXA_DTORS_ITERATIONS 4
+
+struct cxa_thread_dtor {
+ void *obj;
+ void (*func)(void *);
+ void *dso;
+ LIST_ENTRY(cxa_thread_dtor) entry;
+};
+static _Thread_local LIST_HEAD(dtor_list, cxa_thread_dtor) dtors =
+ LIST_HEAD_INITIALIZER(dtors);
+
+int
+__cxa_thread_atexit(void (*dtor_func)(void *), void *obj, void *dso_symbol)
+{
+ struct cxa_thread_dtor *new_dtor;
+
+ new_dtor = malloc(sizeof(*new_dtor));
+ if (new_dtor == NULL) {
+ errno = ENOMEM; /* forcibly override malloc(3) error */
+ return (-1);
+ }
+
+ new_dtor->obj = obj;
+ new_dtor->func = dtor_func;
+ new_dtor->dso = dso_symbol;
+ LIST_INSERT_HEAD(&dtors, new_dtor, entry);
+ return (0);
+}
+
+static void
+walk_cb_call(struct cxa_thread_dtor *dtor)
+{
+ struct dl_phdr_info phdr_info;
+
+ if (_rtld_addr_phdr(dtor->dso, &phdr_info) &&
+ __elf_phdr_match_addr(&phdr_info, dtor->func))
+ dtor->func(dtor->obj);
+ else
+ fprintf(stderr, "__cxa_thread_call_dtors: dtr %p from "
+ "unloaded dso, skipping\n", (void *)(dtor->func));
+}
+
+static void
+walk_cb_nocall(struct cxa_thread_dtor *dtor __unused)
+{
+}
+
+static void
+cxa_thread_walk(void (*cb)(struct cxa_thread_dtor *))
+{
+ struct cxa_thread_dtor *dtor, *tdtor;
+
+ LIST_FOREACH_SAFE(dtor, &dtors, entry, tdtor) {
+ LIST_REMOVE(dtor, entry);
+ cb(dtor);
+ free(dtor);
+ }
+}
+
+/*
+ * This is the callback function we use to call destructors, once for
+ * each thread. It is called in exit(3) in libc/stdlib/exit.c and
+ * before exit_thread() in libthr/thread/thr_exit.c.
+ */
+void
+__cxa_thread_call_dtors(void)
+{
+ int i;
+
+ for (i = 0; i < CXA_DTORS_ITERATIONS && !LIST_EMPTY(&dtors); i++)
+ cxa_thread_walk(walk_cb_call);
+
+ if (!LIST_EMPTY(&dtors)) {
+ fprintf(stderr, "Thread %p is exiting with more "
+ "thread-specific dtors created after %d iterations "
+ "of destructor calls\n",
+ _pthread_self(), i);
+ cxa_thread_walk(walk_cb_nocall);
+ }
+}
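
The registration above is normally emitted by the C++ front end for thread_local objects with non-trivial destructors, but the C entry point can also be exercised directly. A hedged sketch, assuming the __dso_handle symbol provided by the C runtime startup code (the object and destructor names are illustrative):

    #include <stdio.h>

    extern int __cxa_thread_atexit(void (*)(void *), void *, void *);
    extern void *__dso_handle;      /* provided by the C runtime */

    static _Thread_local int counter;

    static void
    counter_dtor(void *obj)
    {
            /* Runs at thread exit, or inside exit(3) for the last thread. */
            printf("counter was %d\n", *(int *)obj);
    }

    static void
    counter_init(void)
    {
            __cxa_thread_atexit(counter_dtor, &counter, &__dso_handle);
    }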
diff --git a/lib/libc/stdlib/exit.c b/lib/libc/stdlib/exit.c
index 145eb9d..b8afede 100644
--- a/lib/libc/stdlib/exit.c
+++ b/lib/libc/stdlib/exit.c
@@ -64,6 +64,12 @@ exit(status)
_thread_autoinit_dummy_decl = 1;
+ /*
+ * Clean up the thread_local destructors here to handle process
+ * termination through exit(3) or a return from main().
+ * Other cases are handled elsewhere.
+ */
+ __cxa_thread_call_dtors();
__cxa_finalize(NULL);
if (__cleanup)
(*__cleanup)();
diff --git a/lib/libc/sys/aio_fsync.2 b/lib/libc/sys/aio_fsync.2
index 7eb1a3b..b271dbf 100644
--- a/lib/libc/sys/aio_fsync.2
+++ b/lib/libc/sys/aio_fsync.2
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd July 15, 2016
+.Dd August 19, 2016
.Dt AIO_FSYNC 2
.Os
.Sh NAME
@@ -74,16 +74,14 @@ the call returns without having enqueued the request.
.Pp
The
.Fa iocb->aio_sigevent
-structure can be used to request notification of the request's
+structure can be used to request notification of the operation's
completion as described in
.Xr aio 4 .
.Sh RESTRICTIONS
-The asynchronous I/O Control Block structure pointed to by
+The Asynchronous I/O Control Block structure pointed to by
.Fa iocb
must remain valid until the
operation has completed.
-For this reason, use of auto (stack) variables
-for these objects is discouraged.
.Pp
The asynchronous I/O control buffer
.Fa iocb
@@ -91,9 +89,8 @@ should be zeroed before the
.Fn aio_fsync
call to avoid passing bogus context information to the kernel.
.Pp
-Modifications of the Asynchronous I/O Control Block structure or the
-buffer contents after the request has been enqueued, but before the
-request has completed, are not allowed.
+Modification of the Asynchronous I/O Control Block structure is not allowed
+while the request is queued.
.Sh RETURN VALUES
.Rv -std aio_fsync
.Sh ERRORS
diff --git a/lib/libc/sys/aio_mlock.2 b/lib/libc/sys/aio_mlock.2
index 03e2df7..76f67ef 100644
--- a/lib/libc/sys/aio_mlock.2
+++ b/lib/libc/sys/aio_mlock.2
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd July 15, 2016
+.Dd August 19, 2016
.Dt AIO_MLOCK 2
.Os
.Sh NAME
@@ -67,7 +67,7 @@ then the call returns without having enqueued the request.
.Pp
The
.Fa iocb->aio_sigevent
-structure can be used to request notification of the request's
+structure can be used to request notification of the operation's
completion as described in
.Xr aio 4 .
.Sh RESTRICTIONS
@@ -77,8 +77,6 @@ and the buffer that the
.Fa iocb->aio_buf
member of that structure references must remain valid until the
operation has completed.
-For this reason, use of auto (stack) variables
-for these objects is discouraged.
.Pp
The asynchronous I/O control buffer
.Fa iocb
@@ -87,8 +85,8 @@ should be zeroed before the
call to avoid passing bogus context information to the kernel.
.Pp
Modifications of the Asynchronous I/O Control Block structure or the
-buffer contents after the request has been enqueued, but before the
-request has completed, are not allowed.
+memory mapping described by the virtual address range are not allowed
+while the request is queued.
.Sh RETURN VALUES
.Rv -std aio_mlock
.Sh ERRORS
diff --git a/lib/libc/sys/aio_read.2 b/lib/libc/sys/aio_read.2
index 69960d5..96d84cc 100644
--- a/lib/libc/sys/aio_read.2
+++ b/lib/libc/sys/aio_read.2
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd July 15, 2016
+.Dd August 19, 2016
.Dt AIO_READ 2
.Os
.Sh NAME
@@ -82,7 +82,7 @@ not be referenced after the request is enqueued.
.Pp
The
.Fa iocb->aio_sigevent
-structure can be used to request notification of the request's
+structure can be used to request notification of the operation's
completion as described in
.Xr aio 4 .
.Sh RESTRICTIONS
@@ -92,8 +92,6 @@ and the buffer that the
.Fa iocb->aio_buf
member of that structure references must remain valid until the
operation has completed.
-For this reason, use of auto (stack) variables
-for these objects is discouraged.
.Pp
The asynchronous I/O control buffer
.Fa iocb
@@ -102,8 +100,7 @@ should be zeroed before the
call to avoid passing bogus context information to the kernel.
.Pp
Modifications of the Asynchronous I/O Control Block structure or the
-buffer contents after the request has been enqueued, but before the
-request has completed, are not allowed.
+buffer contents are not allowed while the request is queued.
.Pp
If the file offset in
.Fa iocb->aio_offset
diff --git a/lib/libc/sys/aio_write.2 b/lib/libc/sys/aio_write.2
index 076ce50..2148913 100644
--- a/lib/libc/sys/aio_write.2
+++ b/lib/libc/sys/aio_write.2
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd July 15, 2016
+.Dd August 19, 2016
.Dt AIO_WRITE 2
.Os
.Sh NAME
@@ -88,7 +88,7 @@ be referenced after the request is enqueued.
.Pp
The
.Fa iocb->aio_sigevent
-structure can be used to request notification of the request's
+structure can be used to request notification of the operation's
completion as described in
.Xr aio 4 .
.Sh RESTRICTIONS
@@ -98,8 +98,6 @@ and the buffer that the
.Fa iocb->aio_buf
member of that structure references must remain valid until the
operation has completed.
-For this reason, use of auto (stack) variables
-for these objects is discouraged.
.Pp
The asynchronous I/O control buffer
.Fa iocb
@@ -108,8 +106,7 @@ should be zeroed before the
system call to avoid passing bogus context information to the kernel.
.Pp
Modifications of the Asynchronous I/O Control Block structure or the
-buffer contents after the request has been enqueued, but before the
-request has completed, are not allowed.
+buffer contents are not allowed while the request is queued.
.Pp
If the file offset in
.Fa iocb->aio_offset
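
The four aio pages above converge on the same rule: zero the control block before the call and leave both it and the buffer untouched while the request is queued. A minimal sketch consistent with that wording (names and error handling are illustrative):

    #include <aio.h>
    #include <string.h>

    /* Start an asynchronous write; iocb and buf must outlive the request. */
    int
    start_write(int fd, const char *msg, size_t len, struct aiocb *iocb,
        char *buf)
    {
            memcpy(buf, msg, len);
            memset(iocb, 0, sizeof(*iocb)); /* avoid bogus context information */
            iocb->aio_fildes = fd;
            iocb->aio_buf = buf;
            iocb->aio_nbytes = len;
            iocb->aio_offset = 0;
            /* Do not modify *iocb or buf until the request completes. */
            return (aio_write(iocb));
    }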
diff --git a/lib/libc/sys/ptrace.2 b/lib/libc/sys/ptrace.2
index fc5c0ea..0e8c8bd 100644
--- a/lib/libc/sys/ptrace.2
+++ b/lib/libc/sys/ptrace.2
@@ -151,6 +151,11 @@ The process ID of the new child process will also be present in the
.Va pl_child_pid
member of
.Vt "struct ptrace_lwpinfo" .
+If the new child process was created via
+.Xr vfork 2 ,
+the traced process's stop will also include the
+.Dv PL_FLAG_VFORKED
+flag.
Note that new child processes will be attached with the default
tracing event mask;
they do not inherit the event mask of the traced process.
@@ -173,6 +178,33 @@ Note that new processes do not report an event for the creation of their
initial thread,
and exiting processes do not report an event for the termination of the
last thread.
+.It Dv PTRACE_VFORK
+Report a stop event when a parent process resumes after a
+.Xr vfork 2 .
+.Pp
+When a thread in the traced process creates a new child process via
+.Xr vfork 2 ,
+the stop that reports
+.Dv PL_FLAG_FORKED
+and
+.Dv PL_FLAG_SCX
+occurs just after the child process is created,
+but before the thread waits for the child process to stop sharing process
+memory.
+If a debugger is not tracing the new child process,
+it must ensure that no breakpoints are enabled in the shared process
+memory before detaching from the new child process.
+This means that no breakpoints are enabled in the parent process either.
+.Pp
+The
+.Dv PTRACE_VFORK
+flag enables a new stop that indicates when the new child process stops
+sharing the process memory of the parent process.
+A debugger can reinsert breakpoints in the parent process and resume it
+in response to this event.
+This event is indicated by setting the
+.Dv PL_FLAG_VFORK_DONE
+flag.
.El
.Pp
The default tracing event mask when attaching to a process via
@@ -501,6 +533,16 @@ is enabled.
Note that this event is not reported when the last LWP in a process exits.
The termination of the last thread is reported via a normal process exit
event.
+.It PL_FLAG_VFORKED
+Indicates that the thread is returning from a call to
+.Xr vfork 2
+that created a new child process.
+This flag is set in addition to
+.Dv PL_FLAG_FORKED .
+.It PL_FLAG_VFORK_DONE
+Indicates that the thread has resumed after a child process created via
+.Xr vfork 2
+has stopped sharing its address space with the traced process.
.El
.It pl_sigmask
The current signal mask of the LWP
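
A debugger opts into the new stop by adding PTRACE_VFORK to the traced process's event mask; a hedged sketch using the PT_GET_EVENT_MASK and PT_SET_EVENT_MASK requests:

    #include <sys/types.h>
    #include <sys/ptrace.h>

    /* Enable fork and vfork stop reporting for an attached process. */
    int
    enable_vfork_events(pid_t pid)
    {
            int mask;

            if (ptrace(PT_GET_EVENT_MASK, pid, (caddr_t)&mask,
                sizeof(mask)) == -1)
                    return (-1);
            mask |= PTRACE_FORK | PTRACE_VFORK;
            return (ptrace(PT_SET_EVENT_MASK, pid, (caddr_t)&mask,
                sizeof(mask)));
    }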
diff --git a/lib/libc/tests/stdlib/Makefile b/lib/libc/tests/stdlib/Makefile
index 2f79858..15d8cbc 100644
--- a/lib/libc/tests/stdlib/Makefile
+++ b/lib/libc/tests/stdlib/Makefile
@@ -1,8 +1,14 @@
# $FreeBSD$
+.include <bsd.own.mk>
+
ATF_TESTS_C+= heapsort_test
ATF_TESTS_C+= mergesort_test
ATF_TESTS_C+= qsort_test
+.if ${COMPILER_FEATURES:Mc++11}
+ATF_TESTS_CXX+= cxa_thread_atexit_test
+ATF_TESTS_CXX+= cxa_thread_atexit_nothr_test
+.endif
TESTSDIR= ${TESTSBASE}/lib/libc/stdlib
@@ -34,6 +40,11 @@ PROGS+= h_getopt h_getopt_long
CFLAGS+= -I${.CURDIR}
+CXXFLAGS.cxa_thread_atexit_test+= -std=c++11
+CXXFLAGS.cxa_thread_atexit_nothr_test+= -std=c++11
+DPADD.cxa_thread_atexit_test+= ${LIBPTHREAD}
+LDADD.cxa_thread_atexit_test+= -lpthread
+
.for t in h_getopt h_getopt_long
CFLAGS.$t+= -I${LIBNETBSD_SRCDIR} -I${SRCTOP}/contrib/netbsd-tests
LDFLAGS.$t+= -L${LIBNETBSD_OBJDIR}
diff --git a/lib/libc/tests/stdlib/cxa_thread_atexit_nothr_test.cc b/lib/libc/tests/stdlib/cxa_thread_atexit_nothr_test.cc
new file mode 100644
index 0000000..3ac3602
--- /dev/null
+++ b/lib/libc/tests/stdlib/cxa_thread_atexit_nothr_test.cc
@@ -0,0 +1,102 @@
+/*-
+ * Copyright (c) 2016 Mahdi Mokhtari <mokhi64@gmail.com>
+ * Copyright (c) 2016 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <dlfcn.h>
+#include <atf-c++.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+static FILE *output = NULL;
+
+struct Foo {
+ Foo() { ATF_REQUIRE(fprintf(output, "Created\n") > 0); }
+ ~Foo() { ATF_REQUIRE(fprintf(output, "Destroyed\n") > 0); }
+ void use() { ATF_REQUIRE(fprintf(output, "Used\n") > 0); }
+};
+
+static thread_local Foo f;
+
+/*
+ * This test must not be linked to libpthread.
+ */
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__nothr);
+ATF_TEST_CASE_BODY(cxx__nothr)
+{
+ void *libthr_handle;
+
+ /* Avoid coredump during f construction. */
+ output = stderr;
+
+ libthr_handle = dlopen("libthr.so.3", RTLD_LAZY | RTLD_GLOBAL |
+ RTLD_NOLOAD);
+ ATF_REQUIRE(libthr_handle == NULL);
+}
+
+static void
+check_local_main(void)
+{
+ static const char out_log[] = "Created\nUsed\nDestroyed\n";
+
+ fflush(output);
+ ATF_REQUIRE(atf::utils::compare_file("test_main.txt", out_log));
+}
+
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_local_main);
+ATF_TEST_CASE_BODY(cxx__thread_local_main)
+{
+
+ ATF_REQUIRE((output = fopen("test_main.txt", "w")) != NULL);
+ f.use();
+ atexit(check_local_main);
+}
+
+extern "C" int __cxa_thread_atexit(void (*)(void *), void *, void *);
+
+static void
+again(void *arg)
+{
+
+ __cxa_thread_atexit(again, arg, &output);
+}
+
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_inf_dtors);
+ATF_TEST_CASE_BODY(cxx__thread_inf_dtors)
+{
+
+ again(NULL);
+}
+
+ATF_INIT_TEST_CASES(tcs)
+{
+
+ ATF_ADD_TEST_CASE(tcs, cxx__nothr);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_local_main);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_inf_dtors);
+}
diff --git a/lib/libc/tests/stdlib/cxa_thread_atexit_test.cc b/lib/libc/tests/stdlib/cxa_thread_atexit_test.cc
new file mode 100644
index 0000000..ded91c9
--- /dev/null
+++ b/lib/libc/tests/stdlib/cxa_thread_atexit_test.cc
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2016 Mahdi Mokhtari <mokhi64@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <dlfcn.h>
+#include <atf-c++.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+
+static FILE *output = NULL;
+
+struct Foo {
+ Foo() { ATF_REQUIRE(fprintf(output, "Created\n") > 0); }
+ ~Foo() { ATF_REQUIRE(fprintf(output, "Destroyed\n") > 0); }
+ void use() { ATF_REQUIRE(fprintf(output, "Used\n") > 0); }
+};
+
+struct Bar {
+ Bar() {}
+ ~Bar() {
+ thread_local static Foo foo;
+ ATF_REQUIRE(fprintf(output, "DIED\n") > 0);
+ }
+ void use() {}
+};
+
+extern "C" int __cxa_thread_atexit(void (*)(void *), void *, void *);
+
+static void
+again(void *arg)
+{
+
+ __cxa_thread_atexit(again, arg, &output);
+}
+
+struct Baz {
+ Baz() {}
+ ~Baz() {
+ again(NULL);
+ }
+ void use() {}
+};
+
+static thread_local Foo f;
+static thread_local Foo g;
+static thread_local Bar h;
+static thread_local Baz e;
+
+/*
+ * This test must be linked to libpthread.
+ */
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thr);
+ATF_TEST_CASE_BODY(cxx__thr)
+{
+ void *libthr_handle;
+
+ /* Avoid coredump during f construction. */
+ output = stderr;
+
+ libthr_handle = dlopen("libthr.so.3", RTLD_LAZY | RTLD_GLOBAL |
+ RTLD_NOLOAD);
+ ATF_REQUIRE(libthr_handle != NULL);
+ dlclose(libthr_handle);
+}
+
+/*
+ * In this test f.use() will test __cxa_thread_atexit() in non-threaded mode.
+ * After f.use(), main becomes threaded and we'll have one additional thread
+ * with its own TLS data.
+ */
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_local_before);
+ATF_TEST_CASE_BODY(cxx__thread_local_before)
+{
+ static const char out_log[] = "Created\nCreated\nUsed\nCreated\n"
+ "Created\nUsed\nCreated\nDIED\nDestroyed\nDestroyed\nDestroyed\n";
+
+ ATF_REQUIRE((output = fopen("test_before.txt", "w")) != NULL);
+
+ f.use();
+ std::thread t([]() { f.use(); });
+ t.join();
+
+ fflush(output);
+
+ ATF_REQUIRE(atf::utils::compare_file("test_before.txt", out_log));
+}
+
+/*
+ * In this test, g.use() will test __cxa_thread_atexit()
+ * in threaded mode (but still in the main thread).
+ */
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_local_after);
+ATF_TEST_CASE_BODY(cxx__thread_local_after)
+{
+ static const char out_log[] = "Created\nCreated\nUsed\nCreated\n"
+ "DIED\nDestroyed\nDestroyed\nDestroyed\nCreated\nCreated\nUsed\n";
+
+ ATF_REQUIRE((output = fopen("test_after.txt", "w")) != NULL);
+
+ std::thread t([]() { g.use(); });
+ t.join();
+ sleep(1);
+ g.use();
+
+ fflush(output);
+
+ ATF_REQUIRE(atf::utils::compare_file("test_after.txt", out_log));
+}
+
+/*
+ * In this test, we register a new dtor while dtors are being run
+ * in __cxa_thread_atexit().
+ */
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_local_add_while_calling_dtors);
+ATF_TEST_CASE_BODY(cxx__thread_local_add_while_calling_dtors)
+{
+ static const char out_log[] = "Created\nCreated\nCreated\nDIED\n"
+ "Destroyed\nDestroyed\nDestroyed\n";
+
+ ATF_REQUIRE((output = fopen("test_add_meanwhile.txt", "w")) != NULL);
+
+ std::thread t([]() { h.use(); });
+ t.join();
+ sleep(1);
+
+ fflush(output);
+
+ ATF_REQUIRE(atf::utils::compare_file("test_add_meanwhile.txt", out_log));
+}
+
+ATF_TEST_CASE_WITHOUT_HEAD(cxx__thread_inf_dtors);
+ATF_TEST_CASE_BODY(cxx__thread_inf_dtors)
+{
+
+ /*
+ * Only added so that an isolated run of this test does not
+ * core dump. Construction of Foo objects requires a valid
+ * output stream.
+ */
+ output = stderr;
+
+ std::thread t([]() { e.use(); });
+ t.join();
+}
+
+ATF_INIT_TEST_CASES(tcs)
+{
+
+ ATF_ADD_TEST_CASE(tcs, cxx__thr);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_local_before);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_local_after);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_local_add_while_calling_dtors);
+ ATF_ADD_TEST_CASE(tcs, cxx__thread_inf_dtors);
+}
diff --git a/lib/libthr/thread/thr_exit.c b/lib/libthr/thread/thr_exit.c
index 7001311..383aafa 100644
--- a/lib/libthr/thread/thr_exit.c
+++ b/lib/libthr/thread/thr_exit.c
@@ -151,8 +151,12 @@ thread_unwind_stop(int version, _Unwind_Action actions,
__pthread_cleanup_pop_imp(1);
}
- if (done)
+ if (done) {
+ /* Tell libc that it should call non-trivial TLS dtors. */
+ __cxa_thread_call_dtors();
+
exit_thread(); /* Never return! */
+ }
return (_URC_NO_REASON);
}
@@ -246,6 +250,8 @@ cleanup:
while (curthread->cleanup != NULL) {
__pthread_cleanup_pop_imp(1);
}
+ __cxa_thread_call_dtors();
+
exit_thread();
}
@@ -253,6 +259,7 @@ cleanup:
while (curthread->cleanup != NULL) {
__pthread_cleanup_pop_imp(1);
}
+ __cxa_thread_call_dtors();
exit_thread();
#endif /* _PTHREAD_FORCED_UNWIND */
diff --git a/libexec/rtld-elf/rtld.c b/libexec/rtld-elf/rtld.c
index 8d0938b..0b72f85 100644
--- a/libexec/rtld-elf/rtld.c
+++ b/libexec/rtld-elf/rtld.c
@@ -1872,6 +1872,7 @@ static void
init_rtld(caddr_t mapbase, Elf_Auxinfo **aux_info)
{
Obj_Entry objtmp; /* Temporary rtld object */
+ const Elf_Ehdr *ehdr;
const Elf_Dyn *dyn_rpath;
const Elf_Dyn *dyn_soname;
const Elf_Dyn *dyn_runpath;
@@ -1910,6 +1911,9 @@ init_rtld(caddr_t mapbase, Elf_Auxinfo **aux_info)
relocate_objects(&objtmp, true, &objtmp, 0, NULL);
}
+ ehdr = (Elf_Ehdr *)mapbase;
+ objtmp.phdr = (Elf_Phdr *)((char *)mapbase + ehdr->e_phoff);
+ objtmp.phsize = ehdr->e_phnum * sizeof(objtmp.phdr[0]);
/* Initialize the object list. */
TAILQ_INIT(&obj_list);
@@ -2120,8 +2124,7 @@ load_needed_objects(Obj_Entry *first, int flags)
{
Obj_Entry *obj;
- obj = first;
- TAILQ_FOREACH_FROM(obj, &obj_list, next) {
+ for (obj = first; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
if (obj->marker)
continue;
if (process_needed(obj, obj->needed, flags) == -1)
@@ -2720,9 +2723,8 @@ relocate_objects(Obj_Entry *first, bool bind_now, Obj_Entry *rtldobj,
Obj_Entry *obj;
int error;
- error = 0;
- obj = first;
- TAILQ_FOREACH_FROM(obj, &obj_list, next) {
+ for (error = 0, obj = first; obj != NULL;
+ obj = TAILQ_NEXT(obj, next)) {
if (obj->marker)
continue;
error = relocate_object(obj, bind_now, rtldobj, flags,
@@ -2762,8 +2764,7 @@ resolve_objects_ifunc(Obj_Entry *first, bool bind_now, int flags,
{
Obj_Entry *obj;
- obj = first;
- TAILQ_FOREACH_FROM(obj, &obj_list, next) {
+ for (obj = first; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
if (obj->marker)
continue;
if (resolve_object_ifunc(obj, bind_now, flags, lockstate) == -1)
@@ -4213,7 +4214,7 @@ trace_loaded_objects(Obj_Entry *obj)
list_containers = getenv(LD_ "TRACE_LOADED_OBJECTS_ALL");
- TAILQ_FOREACH_FROM(obj, &obj_list, next) {
+ for (; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
Needed_Entry *needed;
char *name, *path;
bool is_lib;
@@ -4557,8 +4558,7 @@ allocate_tls(Obj_Entry *objs, void *oldtls, size_t tcbsize, size_t tcbalign)
*/
free_tls(oldtls, 2*sizeof(Elf_Addr), sizeof(Elf_Addr));
} else {
- obj = objs;
- TAILQ_FOREACH_FROM(obj, &obj_list, next) {
+ for (obj = objs; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
if (obj->marker || obj->tlsoffset == 0)
continue;
addr = segbase - obj->tlsoffset;
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 9d3684d..2dd3329 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -358,7 +358,9 @@ MAN= aac.4 \
ng_vlan.4 \
nmdm.4 \
nsp.4 \
- ${_ntb.4} \
+ ${_ntb_hw.4} \
+ ${_ntb_transport.4} \
+ ${_if_ntb.4} \
null.4 \
${_nvd.4} \
${_nve.4} \
@@ -656,8 +658,7 @@ MLINKS+=netintro.4 net.4 \
netintro.4 networking.4
MLINKS+=${_nfe.4} ${_if_nfe.4}
MLINKS+=nge.4 if_nge.4
-MLINKS+=${_ntb.4} ${_if_ntb.4} \
- ${_ntb.4} ${_ntb_hw.4}
+MLINKS+=${_ntb_hw.4} ${_ntb.4}
MLINKS+=${_nve.4} ${_if_nve.4}
MLINKS+=${_nxge.4} ${_if_nxge.4}
MLINKS+=patm.4 if_patm.4
@@ -801,6 +802,7 @@ _if_ntb.4= if_ntb.4
_ioat.4= ioat.4
_ntb.4= ntb.4
_ntb_hw.4= ntb_hw.4
+_ntb_transport.4=ntb_transport.4
_qlxge.4= qlxge.4
_qlxgb.4= qlxgb.4
_qlxgbe.4= qlxgbe.4
diff --git a/share/man/man4/if_ntb.4 b/share/man/man4/if_ntb.4
new file mode 100644
index 0000000..6f52d77
--- /dev/null
+++ b/share/man/man4/if_ntb.4
@@ -0,0 +1,89 @@
+.\"
+.\" Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd July 29, 2016
+.Dt IF_NTB 4
+.Os
+.Sh NAME
+.Nm if_ntb
+.Nd Virtual Ethernet interface for Non-Transparent Bridges
+.Sh SYNOPSIS
+To compile this driver into your kernel,
+place the following lines in your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device if_ntb"
+.Ed
+.Pp
+Or, to load the driver as a module at boot, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_ntb_load="YES"
+.Ed
+.Pp
+The following tunables are settable from the
+.Xr loader 8 :
+.Bl -ohang
+.It Va hw.if_ntb.num_queues
+Number of transport queues to use per interface.
+Default is unlimited.
+.El
+.Sh DESCRIPTION
+The
+.Nm
+driver attaches on top of the
+.Xr ntb_transport 4
+driver to utilize its resources to create a virtual Ethernet interface
+between the systems.
+Interface capabilities depend on the underlying transport.
+The typical MTU is about 64KB to reduce per-packet overhead.
+By default one queue is used, but more may be configured.
+The MAC address for the interface is randomly generated.
+.Pp
+The
+.Nm
+driver does not implement any real hardware offload, but since the PCIe link
+is protected by CRC32, in some situations it may be possible to save some CPU
+cycles by enabling fake checksum offload on both sides of the link via the
+.Cm rxcsum
+and
+.Cm txcsum
+interface options.
+.Sh SEE ALSO
+.Xr ntb_transport 4
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was developed by Intel and originally written by
+.An Carl Delsey Aq Mt carl@FreeBSD.org .
+Later improvements were done by
+.An Conrad E. Meyer Aq Mt cem@FreeBSD.org
+and
+.An Alexander Motin Aq Mt mav@FreeBSD.org .
+.Sh BUGS
+Linux supports only one queue per interface, so manual configuration
+may be required for compatibility.
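
For illustration, the fake checksum offload described above would be enabled on each side with something like "ifconfig ntb0 rxcsum txcsum", where ntb0 is a hypothetical name for the interface created by the driver.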
diff --git a/share/man/man4/ntb_hw.4 b/share/man/man4/ntb_hw.4
new file mode 100644
index 0000000..b6dffce
--- /dev/null
+++ b/share/man/man4/ntb_hw.4
@@ -0,0 +1,117 @@
+.\"
+.\" Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd July 28, 2016
+.Dt NTB_HW 4
+.Os
+.Sh NAME
+.Nm ntb ,
+.Nm ntb_hw
+.Nd Intel(R) Non-Transparent Bridge driver
+.Sh SYNOPSIS
+To compile this driver into your kernel,
+place the following lines in your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device ntb_hw"
+.Ed
+.Pp
+Or, to load the driver as a module at boot, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+ntb_hw_load="YES"
+.Ed
+.Pp
+The following tunables are settable from the
+.Xr loader 8 :
+.Bl -ohang
+.It Va hw.ntb.debug_level
+Driver debug level.
+The default value is 0, higher means more verbose.
+.It Va hint.ntb_hw. Ns Ar X Ns Va .config
+Configures the split of NTB resources between several consumer devices.
+Configurations for multiple consumer devices are separated by commas.
+Each device can be configured as "<name>[:<mw>[:<spad>[:<db>]]]", where:
+.Va name
+is the name of the driver which should attach the device (empty means any),
+.Va mw
+is the number of memory windows to allocate (empty means all available),
+.Va spad
+is the number of scratchpad registers to allocate (empty means all available),
+.Va db
+is the number of doorbells to allocate (empty means all available).
+The default configuration is an empty string, which means a single device
+with all available resources, allowing any driver attachment.
+.El
+.Sh DESCRIPTION
+The NTB allows you to connect two computer systems using a PCIe link if they
+have the correct equipment and connectors.
+The
+.Nm ntb_hw
+driver provides support for the Non-Transparent Bridge (NTB) in the Intel S1200
+and Xeon E3/E5 processor families.
+The
+.Nm
+driver hides hardware details, exposing memory windows, scratchpads and
+doorbells via hardware independent KPI.
+.Pp
+The hardware provides 2-3 memory windows to the other system's memory,
+16 scratchpad registers and 14/34 doorbells to interrupt the other system.
+On Xeon processors one of the memory windows is typically consumed by the
+driver itself to work around multiple hardware errata.
+.Sh CONFIGURATION
+The NTB configuration should be set by BIOS.
+It includes enabling NTB, choosing between NTB-to-NTB or NTB-to-Root Port mode,
+enabling split BAR mode (one of two 64-bit BARs can be split into two 32-bit
+ones) and configuring BAR sizes in bits (from 12 to 29/39) for both NTB sides.
+.Pp
+The recommended configuration is NTB-to-NTB mode with split BAR mode enabled
+and all BAR sizes set to 20 (1 MiB).
+This needs to be done on both systems.
+.Sh SEE ALSO
+.Xr ntb_transport 4 ,
+.Xr if_ntb 4
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was developed by Intel and originally written by
+.An Carl Delsey Aq Mt carl@FreeBSD.org .
+Later improvements were done by
+.An Conrad E. Meyer Aq Mt cem@FreeBSD.org
+and
+.An Alexander Motin Aq Mt mav@FreeBSD.org .
+.Sh BUGS
+NTB-to-Root Port mode is not yet supported, but it doesn't look very useful.
+.Pp
+On Xeon v2/v3/v4 processors split BAR mode should be enabled to allow
+SB01BASE_LOCKUP errata workaround to be applied by the driver.
+.Pp
+There is no way to protect your system from malicious behavior on the other
+system once the link is brought up.
+Anyone with root or kernel access on the other system can read or write to
+any location on your system.
+In other words, only connect two systems that completely trust each other.
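
As an illustration of the hint syntax above, a hypothetical entry such as hint.ntb_hw.0.config="ntb_transport:1:8:8,:1:8:6" (the values are made up) would split the hardware between an ntb_transport consumer and one anonymous consumer, giving each one memory window, eight scratchpad registers, and the stated number of doorbells.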
diff --git a/share/man/man4/ntb_transport.4 b/share/man/man4/ntb_transport.4
new file mode 100644
index 0000000..43b412c
--- /dev/null
+++ b/share/man/man4/ntb_transport.4
@@ -0,0 +1,79 @@
+.\"
+.\" Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd July 29, 2016
+.Dt NTB_TRANSPORT 4
+.Os
+.Sh NAME
+.Nm ntb_transport
+.Nd Packet-oriented transport for Non-Transparent Bridges
+.Sh SYNOPSIS
+To load the driver as a module at boot, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+ntb_transport_load="YES"
+.Ed
+.Pp
+The following tunables are settable from the
+.Xr loader 8 :
+.Bl -ohang
+.It Va hw.ntb_transport.debug_level
+Driver debug level.
+The default value is 0, higher means more verbose.
+.It Va hint.ntb_transport. Ns Ar X Ns Va .config
+Configures queue allocation for consumer devices, separated by commas.
+Each device can be configured as "<name>[:<queues>]", where:
+.Va name
+is the name of the driver which should attach the device (empty means any),
+.Va queues
+is the number of queues to allocate (empty means automatic).
+The default configuration is an empty string, which means a single device
+with one queue per memory window, allowing any driver attachment.
+.El
+.Sh DESCRIPTION
+The
+.Nm
+driver attaches on top of the
+.Nm ntb
+driver to utilize its resources to create a set of bidirectional queues
+delivering packets between the systems.
+The primary purpose of this is to be used by the
+.Nm if_ntb
+network interface, but other consumers may also be developed using the KPI.
+.Sh SEE ALSO
+.Xr if_ntb 4 ,
+.Xr ntb_hw 4
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was developed by Intel and originally written by
+.An Carl Delsey Aq Mt carl@FreeBSD.org .
+Later improvements were done by
+.An Conrad E. Meyer Aq Mt cem@FreeBSD.org
+and
+.An Alexander Motin Aq Mt mav@FreeBSD.org .
diff --git a/sys/boot/efi/loader/Makefile b/sys/boot/efi/loader/Makefile
index bc38ea6..2c430ec 100644
--- a/sys/boot/efi/loader/Makefile
+++ b/sys/boot/efi/loader/Makefile
@@ -50,6 +50,18 @@ CFLAGS+= -DEFI_ZFS_BOOT
.endif
CFLAGS+= -DNO_PCI -DEFI
+.if !defined(BOOT_HIDE_SERIAL_NUMBERS)
+# Export serial numbers, UUID, and asset tag from loader.
+CFLAGS+= -DSMBIOS_SERIAL_NUMBERS
+.if defined(BOOT_LITTLE_ENDIAN_UUID)
+# Use little-endian UUID format as defined in SMBIOS 2.6.
+CFLAGS+= -DSMBIOS_LITTLE_ENDIAN_UUID
+.elif defined(BOOT_NETWORK_ENDIAN_UUID)
+# Use network-endian UUID format for backward compatibility.
+CFLAGS+= -DSMBIOS_NETWORK_ENDIAN_UUID
+.endif
+.endif
+
.if ${MK_FORTH} != "no"
BOOT_FORTH= yes
CFLAGS+= -DBOOT_FORTH
diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h
index 251d62d..1d56ac7 100644
--- a/sys/cam/cam_ccb.h
+++ b/sys/cam/cam_ccb.h
@@ -1084,7 +1084,17 @@ struct ccb_notify_acknowledge {
u_int tag_id; /* Tag for immediate notify */
u_int seq_id; /* Tar for target of notify */
u_int initiator_id; /* Initiator Identifier */
- u_int arg; /* Function specific */
+ u_int arg; /* Response information */
+ /*
+ * Lower byte of arg is one of RESPONSE CODE values defined below
+ * (subset of response codes from SPL-4 and FCP-4 specifications),
+ * upper 3 bytes is code-specific ADDITIONAL RESPONSE INFORMATION.
+ */
+#define CAM_RSP_TMF_COMPLETE 0x00
+#define CAM_RSP_TMF_REJECTED 0x04
+#define CAM_RSP_TMF_FAILED 0x05
+#define CAM_RSP_TMF_SUCCEEDED 0x08
+#define CAM_RSP_TMF_INCORRECT_LUN 0x09
};
/* HBA engine structures. */
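
With the encoding documented above, the lower byte of arg carries the SPL/FCP response code and the upper three bytes carry the additional response information. A small sketch of packing and unpacking the field (the helper names are hypothetical, not part of the CAM API):

    #include <sys/types.h>

    /* Pack/unpack the notify-acknowledge arg field; illustrative helpers. */
    static inline u_int
    cam_rsp_pack(u_int code, u_int add_info)
    {
            return ((code & 0xff) | (add_info << 8));
    }

    static inline u_int
    cam_rsp_code(u_int arg)
    {
            return (arg & 0xff);
    }

    static inline u_int
    cam_rsp_add_info(u_int arg)
    {
            return (arg >> 8);
    }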
diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
index 5455eea..8ec048b 100644
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -1818,6 +1818,7 @@ ctl_init(void)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
softc->flags = 0;
+ TUNABLE_INT_FETCH("kern.cam.ctl.ha_mode", (int *)&softc->ha_mode);
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0,
"HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)");
@@ -1827,6 +1828,7 @@ ctl_init(void)
* figured out through the slot the controller is in. Although it
* is an active/active system, someone has to be in charge.
*/
+ TUNABLE_INT_FETCH("kern.cam.ctl.ha_id", &softc->ha_id);
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0,
"HA head ID (0 - no HA)");
diff --git a/sys/cam/ctl/scsi_ctl.c b/sys/cam/ctl/scsi_ctl.c
index 2705240..abdbdcd 100644
--- a/sys/cam/ctl/scsi_ctl.c
+++ b/sys/cam/ctl/scsi_ctl.c
@@ -1552,6 +1552,7 @@ ctlfedone(struct cam_periph *periph, union ccb *done_ccb)
/*
* Queue this back down to the SIM as an immediate notify.
*/
+ done_ccb->ccb_h.status = CAM_REQ_INPROG;
done_ccb->ccb_h.func_code = XPT_IMMEDIATE_NOTIFY;
xpt_action(done_ccb);
break;
@@ -2040,6 +2041,28 @@ ctlfe_done(union ctl_io *io)
*/
ccb->ccb_h.status = CAM_REQ_INPROG;
ccb->ccb_h.func_code = XPT_NOTIFY_ACKNOWLEDGE;
+ switch (io->taskio.task_status) {
+ case CTL_TASK_FUNCTION_COMPLETE:
+ ccb->cna2.arg = CAM_RSP_TMF_COMPLETE;
+ break;
+ case CTL_TASK_FUNCTION_SUCCEEDED:
+ ccb->cna2.arg = CAM_RSP_TMF_SUCCEEDED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_FUNCTION_REJECTED:
+ ccb->cna2.arg = CAM_RSP_TMF_REJECTED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_LUN_DOES_NOT_EXIST:
+ ccb->cna2.arg = CAM_RSP_TMF_INCORRECT_LUN;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_FUNCTION_NOT_SUPPORTED:
+ ccb->cna2.arg = CAM_RSP_TMF_FAILED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ }
+ ccb->cna2.arg |= scsi_3btoul(io->taskio.task_resp) << 8;
xpt_action(ccb);
} else if (io->io_hdr.flags & CTL_FLAG_STATUS_SENT) {
if (softc->flags & CTLFE_LUN_WILDCARD) {
diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h
index 4e5b1c9..019efdf 100644
--- a/sys/cddl/compat/opensolaris/sys/vnode.h
+++ b/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -87,8 +87,6 @@ vn_is_readonly(vnode_t *vp)
#define VN_RELE(v) vrele(v)
#define VN_URELE(v) vput(v)
-#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0)
-
#define vnevent_create(vp, ct) do { } while (0)
#define vnevent_link(vp, ct) do { } while (0)
#define vnevent_remove(vp, dvp, name, ct) do { } while (0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
index 349f8ef..22d8e60 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -48,18 +48,18 @@ extern "C" {
#define IS_ROOT_NODE 0x01 /* create a root node */
#define IS_XATTR 0x02 /* create an extended attribute node */
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
- int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
- pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
index 4120883..df5ce05 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -75,6 +75,7 @@ struct zfsvfs {
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
+ boolean_t z_use_namecache;/* make use of FreeBSD name cache */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index 3e72ec4..7649295 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -181,10 +181,12 @@ typedef struct znode {
struct zfsvfs *z_zfsvfs;
vnode_t *z_vnode;
uint64_t z_id; /* object ID for this znode */
+#ifdef illumos
kmutex_t z_lock; /* znode modification lock */
krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+#endif
kmutex_t z_range_lock; /* protects changes to z_range_avl */
avl_tree_t z_range_avl; /* avl tree of file range locks */
uint8_t z_unlinked; /* file has been unlinked */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index fd1d59b..2e94ccc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -1058,8 +1058,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
- boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
{
zfs_acl_t *aclp;
int aclsize;
@@ -1068,26 +1067,15 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
zfs_acl_phys_t znode_acl;
int version;
int error;
- boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
if (zp->z_acl_cached && !will_modify) {
*aclpp = zp->z_acl_cached;
return (0);
}
- /*
- * close race where znode could be upgrade while trying to
- * read the znode attributes.
- *
- * But this could only happen if the file isn't already an SA
- * znode
- */
- if (!zp->z_is_sa && !have_lock) {
- mutex_enter(&zp->z_lock);
- drop_lock = B_TRUE;
- }
version = zfs_znode_acl_version(zp);
if ((error = zfs_acl_znode_info(zp, &aclsize,
@@ -1133,8 +1121,6 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
if (!will_modify)
zp->z_acl_cached = aclp;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
return (error);
}
@@ -1161,10 +1147,10 @@ zfs_acl_chown_setattr(znode_t *zp)
int error;
zfs_acl_t *aclp;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
&zp->z_pflags, zp->z_uid, zp->z_gid);
return (error);
@@ -1445,18 +1431,17 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
int error = 0;
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
else
- error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+ error = zfs_acl_node_read(zp, aclp, B_TRUE);
if (error == 0) {
(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
zfs_acl_chmod(ZTOV(zp)->v_type, mode,
(zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
}
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1627,6 +1612,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
boolean_t need_chmod = B_TRUE;
boolean_t inherited = B_FALSE;
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1710,12 +1696,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if (acl_ids->z_aclp == NULL) {
mutex_enter(&dzp->z_acl_lock);
- mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) &&
(dzp->z_pflags & ZFS_INHERIT_ACE) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
- &paclp, B_FALSE));
+ VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
inherited = B_TRUE;
@@ -1724,7 +1708,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
zfs_acl_alloc(zfs_acl_version_zp(dzp));
acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
- mutex_exit(&dzp->z_lock);
mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
@@ -1790,7 +1773,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1938,6 +1922,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
boolean_t fuid_dirtied;
uint64_t acl_obj;
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (mask == 0)
return (SET_ERROR(ENOSYS));
@@ -1962,7 +1947,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
top:
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
@@ -1994,7 +1978,6 @@ top:
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
if (error == ERESTART) {
@@ -2020,7 +2003,6 @@ top:
if (fuidp)
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -2124,7 +2106,8 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index cf42ff6..f8f695b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -58,96 +58,64 @@
#include <sys/extdirent.h>
/*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
* of names after deciding which is the appropriate lookup interface.
*/
static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
- boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ boolean_t exact, uint64_t *zoid)
{
int error;
if (zfsvfs->z_norm) {
- matchtype_t mt = MT_FIRST;
- boolean_t conflict = B_FALSE;
- size_t bufsz = 0;
- char *buf = NULL;
-
- if (rpnp) {
- buf = rpnp->pn_buf;
- bufsz = rpnp->pn_bufsize;
- }
- if (exact)
- mt = MT_EXACT;
+ matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
+
/*
* In the non-mixed case we only expect there would ever
* be one match, but we need to use the normalizing lookup.
*/
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, buf, bufsz, &conflict);
- if (!error && deflags)
- *deflags = conflict ? ED_CASE_CONFLICT : 0;
+ zoid, mt, NULL, 0, NULL);
} else {
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
}
*zoid = ZFS_DIRENT_OBJ(*zoid);
- if (error == ENOENT && update)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
return (error);
}
/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modifications of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
*
* Input arguments:
* dzp - znode for directory
* name - name of entry to lock
* flag - ZNEW: if the entry already exists, fail with EEXIST.
* ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
* ZXATTR: we want dzp's xattr directory
- * ZCILOOK: On a mixed sensitivity file system,
- * this lookup should be case-insensitive.
- * ZCIEXACT: On a purely case-insensitive file system,
- * this lookup should be case-sensitive.
- * ZRENAMING: we are locking for renaming, force narrow locks
- * ZHAVELOCK: Don't grab the z_name_lock for this call. The
- * current thread already holds it.
*
* Output arguments:
* zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- * direntflags - (case-insensitive lookup only)
- * flags if multiple case-sensitive matches exist in directory
- * realpnp - (case-insensitive lookup only)
- * actual name matched within the directory
*
* Return value: 0 on success or errno on failure.
*
* NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- * but return znode pointers to a single match.
*/
int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
{
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- boolean_t update;
boolean_t exact;
uint64_t zoid;
vnode_t *vp = NULL;
int error = 0;
- int cmpflags;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
*zpp = NULL;
- *dlpp = NULL;
/*
* Verify that we are not trying to lock '.', '..', or '.zfs'
@@ -161,280 +129,93 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
* Case sensitivity and normalization preferences are set when
* the file system is created. These are stored in the
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect what vnodes can be cached in the DNLC, how we
- * perform zap lookups, and the "width" of our dirlocks.
+ * affect how we perform zap lookups.
*
- * A normal dirlock locks a single name. Note that with
- * normalization a name can be composed multiple ways, but
- * when normalized, these names all compare equal. A wide
- * dirlock locks multiple names. We need these when the file
- * system is supporting mixed-mode access. It is sometimes
- * necessary to lock all case permutations of file name at
- * once so that simultaneous case-insensitive/case-sensitive
- * behaves as rationally as possible.
- */
-
- /*
* Decide if exact matches should be requested when performing
* a zap lookup on file systems supporting case-insensitive
* access.
- */
- exact =
- ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
*
- * Maybe can add TO-UPPERed version of name to dnlc in ci-only
- * case for performance improvement?
- */
- update = !zfsvfs->z_norm ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
-
- /*
- * ZRENAMING indicates we are in a situation where we should
- * take narrow locks regardless of the file system's
- * preferences for normalizing and case folding. This will
- * prevent us deadlocking trying to grab the same wide lock
- * twice if the two names happen to be case-insensitive
- * matches.
- */
- if (flag & ZRENAMING)
- cmpflags = 0;
- else
- cmpflags = zfsvfs->z_norm;
-
- /*
- * Wait until there are no locks on this name.
- *
- * Don't grab the the lock if it is already held. However, cannot
- * have both ZSHARED and ZHAVELOCK together.
- */
- ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
- if (!(flag & ZHAVELOCK))
- rw_enter(&dzp->z_name_lock, RW_READER);
-
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked && !(flag & ZXATTR)) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
- if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
- U8_UNICODE_LATEST, &error) == 0) || error != 0)
- break;
- }
- if (error != 0) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- if (dl == NULL) {
- size_t namesize;
-
- /*
- * Allocate a new dirlock and add it to the list.
- */
- namesize = strlen(name) + 1;
- dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize,
- KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = (char *)(dl + 1);
- bcopy(name, dl->dl_name, namesize);
- dl->dl_sharecnt = 0;
- dl->dl_namelock = 0;
- dl->dl_namesize = namesize;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- /*
- * If the z_name_lock was NOT held for this dirlock record it.
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
*/
- if (flag & ZHAVELOCK)
- dl->dl_namelock = 1;
+ exact = zfsvfs->z_case == ZFS_CASE_MIXED;
- if (flag & ZSHARED)
- dl->dl_sharecnt++;
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
if (flag & ZXATTR) {
error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
sizeof (zoid));
if (error == 0)
error = (zoid == 0 ? ENOENT : 0);
} else {
- if (update)
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = SET_ERROR(ENOENT);
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (SET_ERROR(EEXIST));
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, exact,
- update, direntflags, realpnp, &zoid);
- }
+ error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
}
if (error) {
if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
return (error);
}
} else {
if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EEXIST));
}
error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
+ if (error)
return (error);
- }
- if (!(flag & ZXATTR) && update)
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+ ASSERT(!(*zpp)->z_unlinked);
}
- *dlpp = dl;
-
return (0);
}
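ZNEW and ZEXISTS are all that survive of the old flag set. Their semantics, reduced to a self-contained userspace toy (the flag values, directory contents, and helper name are illustrative only):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define ZNEW    0x01    /* fail with EEXIST if the entry exists */
    #define ZEXISTS 0x02    /* fail with ENOENT if the entry is missing */

    static const char *dir[] = { "a.txt", "b.txt", NULL };

    /* The caller is assumed to hold the directory lock, as in the rework. */
    static int dirent_lookup(const char *name, const char **out, int flag) {
        *out = NULL;
        for (const char **e = dir; *e != NULL; e++) {
            if (strcmp(*e, name) == 0) {
                if (flag & ZNEW)
                    return EEXIST;          /* create: must not exist */
                *out = *e;
                return 0;
            }
        }
        /* Not found: an error only if the caller demanded existence. */
        return (flag & ZEXISTS) ? ENOENT : 0;
    }

    int main(void) {
        const char *e;
        printf("ZNEW a.txt    -> %d (EEXIST=%d)\n",
            dirent_lookup("a.txt", &e, ZNEW), EEXIST);
        printf("ZEXISTS c.txt -> %d (ENOENT=%d)\n",
            dirent_lookup("c.txt", &e, ZEXISTS), ENOENT);
        return 0;
    }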
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
- mutex_enter(&dzp->z_lock);
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
- if (!dl->dl_namelock)
- rw_exit(&dzp->z_name_lock);
+ if (dzp->z_unlinked)
+ return (ENOENT);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl) + dl->dl_namesize);
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
}
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
- int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
{
- zfs_dirlock_t *dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
znode_t *zp;
int error = 0;
- uint64_t parent;
- int unlinked;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
-
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if ((error = sa_lookup(dzp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- return (error);
- if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred,
- NULL, NULL, NULL);
- return (error);
- }
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, zpp);
} else {
- int zf;
-
- zf = ZEXISTS | ZSHARED;
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
}
- rpnp = NULL;
}
-
- if ((flags & FIGNORECASE) && rpnp && !error)
- (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
return (error);
}
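The rewritten zfs_dirlook() dispatches on three name shapes before falling back to a zap lookup. The same dispatch as a tiny standalone sketch:

    #include <stdio.h>

    /* The three name shapes zfs_dirlook() now distinguishes. */
    static const char *dispatch(const char *name) {
        if (name[0] == 0 || (name[0] == '.' && name[1] == 0))
            return "self";          /* "" or "." -> *zpp = dzp */
        if (name[0] == '.' && name[1] == '.' && name[2] == 0)
            return "parent";        /* ".." -> zfs_dd_lookup() */
        return "zap";               /* anything else -> zfs_dirent_lookup() */
    }

    int main(void) {
        const char *names[] = { "", ".", "..", "file" };
        for (int i = 0; i < 4; i++)
            printf("%-4s -> %s\n", names[i], dispatch(names[i]));
        return 0;
    }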
@@ -510,8 +291,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
if (error != 0)
continue;
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
+ vput(ZTOV(zp));
}
zap_cursor_fini(&zc);
}
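This hunk and many below convert illumos VN_HOLD()/VN_RELE() to FreeBSD vref()/vrele(), and pair vn_lock() with vput(), which unlocks and drops the reference in one call. A single-threaded userspace model of that pairing (these mirror, not implement, the kernel primitives):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    typedef struct vn {
        pthread_mutex_t lock;
        int refs;               /* not atomic; single-threaded model */
    } vn_t;

    static void vref(vn_t *v)  { v->refs++; }
    static void vrele(vn_t *v) { assert(v->refs > 0); v->refs--; }
    /* vput() == "unlock, then drop the reference", in one call. */
    static void vput(vn_t *v)  { pthread_mutex_unlock(&v->lock); vrele(v); }

    int main(void) {
        vn_t v = { PTHREAD_MUTEX_INITIALIZER, 1 };  /* held with one ref */
        pthread_mutex_lock(&v.lock);                /* and locked */
        printf("refs before vput: %d\n", v.refs);
        vput(&v);                                   /* unlock + vrele */
        printf("refs after vput:  %d\n", v.refs);
        return 0;
    }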
@@ -535,7 +317,6 @@ zfs_purgedir(znode_t *dzp)
znode_t *xzp;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
int skipped = 0;
int error;
@@ -549,6 +330,7 @@ zfs_purgedir(znode_t *dzp)
continue;
}
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
ASSERT((ZTOV(xzp)->v_type == VREG) ||
(ZTOV(xzp)->v_type == VLNK));
@@ -563,20 +345,17 @@ zfs_purgedir(znode_t *dzp)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
skipped += 1;
continue;
}
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
if (error)
skipped += 1;
dmu_tx_commit(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
zap_cursor_fini(&zc);
if (error != ENOENT)
@@ -596,6 +375,7 @@ zfs_rmnode(znode_t *zp)
int error;
ASSERT(zp->z_links == 0);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
/*
* If this is an attribute directory, purge its contents.
@@ -634,7 +414,8 @@ zfs_rmnode(znode_t *zp)
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
+ ASSERT3S(error, ==, 0);
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
}
acl_obj = zfs_external_acl(zp);
@@ -668,12 +449,10 @@ zfs_rmnode(znode_t *zp)
if (xzp) {
ASSERT(error == 0);
- mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
xzp->z_links = 0; /* no more links to it */
VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&xzp->z_links, sizeof (xzp->z_links), tx));
- mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -686,7 +465,7 @@ zfs_rmnode(znode_t *zp)
dmu_tx_commit(tx);
out:
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
static uint64_t
@@ -700,12 +479,12 @@ zfs_dirent(znode_t *zp, uint64_t mode)
}
/*
- * Link zp into dl. Can only fail if zp has been unlinked.
+ * Link zp into dzp. Can only fail if zp has been unlinked.
*/
int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
@@ -715,18 +494,32 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
int count = 0;
int error;
- mutex_enter(&zp->z_lock);
-
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+ if (zp_is_dir) {
+ error = 0;
+ if (dzp->z_links >= LINK_MAX)
+ error = SET_ERROR(EMLINK);
+ return (error);
+ }
+#endif
if (!(flag & ZRENAMING)) {
if (zp->z_unlinked) { /* no new links to unlinked zp */
ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
return (SET_ERROR(ENOENT));
}
+#if 0
+ if (zp->z_links >= LINK_MAX) {
+ return (SET_ERROR(EMLINK));
+ }
+#endif
zp->z_links++;
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
&zp->z_links, sizeof (zp->z_links));
+ } else {
+ ASSERT(zp->z_unlinked == 0);
}
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
&dzp->z_id, sizeof (dzp->z_id));
@@ -740,11 +533,8 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ctime, B_TRUE);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
-
- mutex_exit(&zp->z_lock);
+ ASSERT0(error);
- mutex_enter(&dzp->z_lock);
dzp->z_size++;
dzp->z_links += zp_is_dir;
count = 0;
@@ -760,55 +550,48 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
8, 1, &value, tx);
- ASSERT(error == 0);
-
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+ VERIFY0(error);
return (0);
}
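The assertions here migrate from ASSERT(error == 0) to ASSERT0() and, for the zap_add() result, to VERIFY0(). The distinction matters because ASSERT-style checks compile away in non-debug builds while VERIFY-style checks always run. A userspace analogue using assert() and NDEBUG (VERIFY0 here is a stand-in macro, not the kernel's):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define VERIFY0(x) do { if ((x) != 0) abort(); } while (0)

    static int has_side_effect(void) {
        puts("side effect ran");
        return 0;
    }

    int main(void) {
        assert(has_side_effect() == 0);  /* vanishes under -DNDEBUG */
        VERIFY0(has_side_effect());      /* always runs, always checked */
        return 0;
    }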
static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
int flag)
{
int error;
if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
+ dzp->z_id, name, MT_EXACT, tx);
else
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
+ dzp->z_id, name, MT_FIRST, tx);
} else {
error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
+ dzp->z_id, name, tx);
}
return (error);
}
/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
* and it's the caller's job to do it.
*/
int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
@@ -818,22 +601,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
int count = 0;
int error;
- dnlc_remove(ZTOV(dzp), dl->dl_name);
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (!(flag & ZRENAMING)) {
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (SET_ERROR(EBUSY));
-
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (SET_ERROR(EBUSY));
- }
-
- mutex_enter(&zp->z_lock);
if (zp_is_dir && !zfs_dirempty(zp)) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
#ifdef illumos
return (SET_ERROR(EEXIST));
#else
@@ -846,10 +619,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
* First try removing the name from the directory; if that
* fails, return the error.
*/
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
return (error);
}
@@ -876,16 +647,14 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &zp->z_links, sizeof (zp->z_links));
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
count = 0;
- ASSERT(error == 0);
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
+ ASSERT0(error);
} else {
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0)
return (error);
}
- mutex_enter(&dzp->z_lock);
dzp->z_size--; /* one dirent removed */
dzp->z_links -= zp_is_dir; /* ".." link from zp */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
@@ -900,8 +669,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -912,14 +680,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}
/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be consider a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
*/
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2);
}
int
@@ -1013,23 +779,20 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
znode_t *xzp;
- zfs_dirlock_t *dl;
vattr_t va;
int error;
top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
if (error)
return (error);
if (xzp != NULL) {
*xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
return (0);
}
if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
#ifdef illumos
return (SET_ERROR(ENOENT));
#else
@@ -1038,7 +801,6 @@ top:
}
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EROFS));
}
@@ -1058,7 +820,6 @@ top:
zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
if (error == ERESTART) {
/* NB: we already did dmu_tx_wait() if necessary */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
index 3a472aa..819eca2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -124,7 +124,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa) {
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -158,7 +158,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -205,7 +205,6 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
uint64_t crtime[2], mtime[2], ctime[2];
zfs_acl_phys_t znode_acl;
char scanstamp[AV_SCANSTAMP_SZ];
- boolean_t drop_lock = B_FALSE;
/*
* No upgrade if ACL isn't cached
@@ -217,20 +216,16 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
return;
/*
- * If the z_lock is held and we aren't the owner
- * the just return since we don't want to deadlock
+ * If the vnode lock is held and we aren't the owner
+ * then just return since we don't want to deadlock
* trying to update the status of z_is_sa. This
* file can then be upgraded at a later time.
*
* Otherwise, we know we are doing the
* sa_update() that caused us to enter this function.
*/
- if (mutex_owner(&zp->z_lock) != curthread) {
- if (mutex_tryenter(&zp->z_lock) == 0)
+ if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
return;
- else
- drop_lock = B_TRUE;
- }
/* First do a bulk query of the attributes that aren't cached */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
@@ -311,8 +306,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
zp->z_is_sa = B_TRUE;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
+ VOP_UNLOCK(ZTOV(zp), 0);
}
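The mutex_owner()/mutex_tryenter() dance is collapsed into one non-blocking vn_lock(..., LK_NOWAIT): if the lock cannot be taken immediately, possibly because this thread already holds it higher up the stack, the SA upgrade is simply deferred. A userspace sketch of the same skip-if-contended pattern:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

    static void maybe_upgrade(void) {
        /* If the lock is busy (possibly held by this very thread higher
         * in the call stack), skip; the upgrade can happen later. */
        if (pthread_mutex_trylock(&lk) != 0)
            return;
        puts("upgrading");
        pthread_mutex_unlock(&lk);
    }

    int main(void) {
        maybe_upgrade();            /* lock free: upgrade runs */
        pthread_mutex_lock(&lk);
        maybe_upgrade();            /* lock held: silently skipped */
        pthread_mutex_unlock(&lk);
        return 0;
    }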
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 8523bc4..aa711f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -957,6 +957,18 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
else if (error != 0)
return (error);
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
return (0);
}
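The z_use_namecache predicate can be checked in isolation; a standalone sketch with illustrative flag values (the real U8_TEXTPREP_TOUPPER constant lives in the kernel headers):

    #include <stdbool.h>
    #include <stdio.h>

    #define U8_TEXTPREP_TOUPPER 0x4     /* illustrative value */
    #define NORM_OTHER          0x1     /* any other normalization bit */

    enum zcase { CASE_SENSITIVE, CASE_INSENSITIVE, CASE_MIXED };

    static bool use_namecache(int norm, enum zcase zc) {
        return !norm ||
            (zc == CASE_MIXED && !(norm & ~U8_TEXTPREP_TOUPPER));
    }

    int main(void) {
        printf("%d\n", use_namecache(0, CASE_SENSITIVE));                /* 1 */
        printf("%d\n", use_namecache(U8_TEXTPREP_TOUPPER, CASE_MIXED));  /* 1 */
        printf("%d\n", use_namecache(U8_TEXTPREP_TOUPPER | NORM_OTHER,
            CASE_MIXED));                                                /* 0 */
        printf("%d\n", use_namecache(U8_TEXTPREP_TOUPPER,
            CASE_INSENSITIVE));                                          /* 0 */
        return 0;
    }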
@@ -997,7 +1009,11 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
+#ifdef DIAGNOSTIC
+ rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -2044,7 +2060,7 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
ZFS_ENTER(zfsvfs);
err = zfs_zget(zfsvfs, ino, &zp);
if (err == 0 && zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
err = EINVAL;
}
if (err == 0)
@@ -2145,7 +2161,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
0, NULL, NULL, NULL, NULL, NULL) == 0);
} else {
- VN_HOLD(*vpp);
+ vref(*vpp);
}
ZFS_EXIT(zfsvfs);
err = vn_lock(*vpp, flags);
@@ -2168,7 +2184,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 17179f6..e2fe974 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -66,7 +66,6 @@
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
-#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
@@ -147,7 +146,7 @@
*
* ZFS_ENTER(zfsvfs); // exit if unmounted
* top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * zfs_dirent_lookup(dzp, ...) // look up directory entry (may VN_HOLD())
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
@@ -1433,26 +1432,81 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
return (error);
}
-/*
- * If vnode is for a device return a specfs vnode instead.
- */
static int
-specvp_check(vnode_t **vpp, cred_t *cr)
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
- int error = 0;
-
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
+ int error;
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = SET_ERROR(ENOSYS);
- *vpp = svp;
- }
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
return (error);
}
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+ ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relocking for the "." case could leave us with
+ * a reclaimed vnode.
+ */
+ if (dvp->v_iflag & VI_DOOMED) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires a rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes changes, for example, gets reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
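The "." branch above relocks dvp and then revalidates it, since the vnode may be doomed by the time the lock-type change completes. The drop-relock-revalidate pattern in self-contained form (pthread rwlocks standing in for vnode locks; illustrative only):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct obj {
        pthread_rwlock_t lock;
        bool doomed;    /* analogue of VI_DOOMED: died while unlocked */
    } obj_t;

    /* "Upgrade" by dropping the read lock and taking the write lock;
     * anything may have happened in between, so revalidate afterwards. */
    static int upgrade_and_check(obj_t *o) {
        pthread_rwlock_unlock(&o->lock);
        pthread_rwlock_wrlock(&o->lock);
        if (o->doomed) {
            pthread_rwlock_unlock(&o->lock);
            return -1;          /* caller treats this like ENOENT */
        }
        return 0;
    }

    int main(void) {
        obj_t o = { .lock = PTHREAD_RWLOCK_INITIALIZER, .doomed = false };
        pthread_rwlock_rdlock(&o.lock);
        if (upgrade_and_check(&o) == 0) {
            puts("still valid after relock");
            pthread_rwlock_unlock(&o.lock);
        }
        return 0;
    }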
/*
* Lookup an entry in a directory, or an extended attribute directory.
@@ -1465,8 +1519,6 @@ specvp_check(vnode_t **vpp, cred_t *cr)
* rdir - root directory vnode [UNUSED].
* cr - credentials of caller.
* ct - caller context
- * direntflags - directory lookup flags
- * realpnp - returned pathname.
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
@@ -1481,46 +1533,17 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
int nameiop, cred_t *cr, kthread_t *td, int flags)
{
znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error = 0;
- int *direntflags = NULL;
- void *realpnp = NULL;
-
- /* fast path */
- if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+ /* fast path (should be redundant with vfs namecache) */
+ if (!(flags & LOOKUP_XATTR)) {
if (dvp->v_type != VDIR) {
return (SET_ERROR(ENOTDIR));
} else if (zdp->z_sa_hdl == NULL) {
return (SET_ERROR(EIO));
}
-
- if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (!error) {
- *vpp = dvp;
- VN_HOLD(*vpp);
- return (0);
- }
- return (error);
- } else {
- vnode_t *tvp = dnlc_lookup(dvp, nm);
-
- if (tvp) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (error) {
- VN_RELE(tvp);
- return (error);
- }
- if (tvp == DNLC_NO_VNODE) {
- VN_RELE(tvp);
- return (SET_ERROR(ENOENT));
- } else {
- *vpp = tvp;
- return (specvp_check(vpp, cr));
- }
- }
- }
}
DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
@@ -1558,10 +1581,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
/*
* Do we have permission to get into attribute directory?
*/
-
if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
B_FALSE, cr)) {
- VN_RELE(*vpp);
+ vrele(*vpp);
*vpp = NULL;
}
@@ -1569,15 +1591,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
return (error);
}
- if (dvp->v_type != VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTDIR));
- }
-
/*
* Check accessibility of directory.
*/
-
if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -1589,9 +1605,90 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
return (SET_ERROR(EILSEQ));
}
- error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
- if (error == 0)
- error = specvp_check(vpp, cr);
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred,
+ NULL, NULL, NULL);
+ ZFS_EXIT(zfsvfs);
+ if (error == 0) {
+ error = zfs_lookup_lock(dvp, *vpp, nm,
+ cnp->cn_lkflags);
+ }
+ goto out;
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ error = 0;
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ error = SET_ERROR(ENOTSUP);
+ else
+ *vpp = zfsctl_root(zdp);
+ ZFS_EXIT(zfsvfs);
+ if (error == 0)
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ goto out;
+ }
+
+ /*
+ * The loop retries the lookup if the parent-child relationship
+ * changes while the vnodes are unlocked for the dot-dot case.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+out:
+ if (error != 0)
+ *vpp = NULL;
/* Translate errors and add SAVENAME when needed. */
if (cnp->cn_flags & ISLASTCN) {
@@ -1610,42 +1707,20 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
break;
}
}
- if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
- int ltype = 0;
- if (cnp->cn_flags & ISDOTDOT) {
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp, 0);
- }
- ZFS_EXIT(zfsvfs);
- error = vn_lock(*vpp, cnp->cn_lkflags);
- if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, ltype | LK_RETRY);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- return (error);
- }
- } else {
- ZFS_EXIT(zfsvfs);
- }
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
-#ifdef FREEBSD_NAMECACHE
- /*
- * Insert name into cache (as non-existent) if appropriate.
- */
- if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
- cache_enter(dvp, *vpp, cnp);
- /*
- * Insert name into cache if appropriate.
- */
- if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
if (!(cnp->cn_flags & ISLASTCN) ||
(nameiop != DELETE && nameiop != RENAME)) {
cache_enter(dvp, *vpp, cnp);
}
}
-#endif
return (error);
}
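With FREEBSD_NAMECACHE now unconditional, zfs_lookup() enters both hits and misses into the name cache whenever z_use_namecache allows. A toy cache distinguishing positive from negative entries (sentinel-based, illustrative only):

    #include <stdio.h>
    #include <string.h>

    static const char NEG_SENTINEL;            /* address marks a cached miss */
    #define NEG (&NEG_SENTINEL)

    static const char *cache_names[8];
    static const char *cache_vals[8];
    static int cache_n;

    static void cache_enter(const char *name, const char *val) {
        if (cache_n < 8) {
            cache_names[cache_n] = name;
            cache_vals[cache_n] = val;
            cache_n++;
        }
    }

    static const char *cache_lookup(const char *name) {
        for (int i = 0; i < cache_n; i++)
            if (strcmp(cache_names[i], name) == 0)
                return cache_vals[i];
        return NULL;                           /* not cached at all */
    }

    int main(void) {
        cache_enter("hit.txt", "vnode#7");     /* positive entry (error == 0) */
        cache_enter("miss.txt", NEG);          /* negative entry (ENOENT) */
        printf("hit.txt  -> %s\n", cache_lookup("hit.txt"));
        printf("miss.txt -> %s\n",
            cache_lookup("miss.txt") == NEG ? "known-missing" : "?");
        return 0;
    }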
@@ -1683,7 +1758,6 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
objset_t *os;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
ksid_t *ksid;
@@ -1691,10 +1765,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
- boolean_t have_acl = B_FALSE;
- boolean_t waited = B_FALSE;
void *vsecp = NULL;
int flag = 0;
+ uint64_t txtype;
/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -1731,182 +1804,89 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
}
}
- getnewvnode_reserve(1);
-
-top:
*vpp = NULL;
if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
vap->va_mode &= ~S_ISVTX;
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- VN_HOLD(dvp);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible VN_HOLD(zp) */
- int zflg = 0;
-
- if (flag & FIGNORECASE)
- zflg |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL);
- if (error) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- if (strcmp(name, "..") == 0)
- error = SET_ERROR(EISDIR);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
}
+ ASSERT3P(zp, ==, NULL);
- if (zp == NULL) {
- uint64_t txtype;
-
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ goto out;
+ }
- if ((dzp->z_pflags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EINVAL);
- goto out;
- }
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
- if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
- have_acl = B_TRUE;
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
- tx = dmu_tx_create(os);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
+ getnewvnode_reserve(1);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ tx = dmu_tx_create(os);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
- if (flag & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, acl_ids.z_fuidp, vap);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
- } else {
- int aflags = (flag & FAPPEND) ? V_APPEND : 0;
-
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- have_acl = B_FALSE;
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- /*
- * A directory entry already exists for this name.
- */
- /*
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl == EXCL) {
- error = SET_ERROR(EEXIST);
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
- error = SET_ERROR(EISDIR);
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
- goto out;
- }
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
- /*
- * Truncate regular files if requested.
- */
- if ((ZTOV(zp)->v_type == VREG) &&
- (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
- /* we can't hold any locks when calling zfs_freesp() */
- zfs_dirent_unlock(dl);
- dl = NULL;
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == 0) {
- vnevent_create(ZTOV(zp), ct);
- }
- }
- }
-out:
getnewvnode_drop_reserve();
- if (dl)
- zfs_dirent_unlock(dl);
- if (error) {
- if (zp)
- VN_RELE(ZTOV(zp));
- } else {
+out:
+ if (error == 0) {
*vpp = ZTOV(zp);
- error = specvp_check(vpp, cr);
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
@@ -1932,57 +1912,30 @@ out:
* vp - ctime (if nlink > 0)
*/
-uint64_t null_xattr = 0;
-
/*ARGSUSED*/
static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
- int flags)
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
- znode_t *zp, *dzp = VTOZ(dvp);
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
znode_t *xzp;
- vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
- uint64_t xattr_obj_unlinked = 0;
uint64_t obj = 0;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
boolean_t unlinked, toobig = FALSE;
uint64_t txtype;
- pathname_t *realnmp = NULL;
- pathname_t realnm;
int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
+ zp = VTOZ(vp);
- if (flags & FIGNORECASE) {
- zflg |= ZCILOOK;
- pn_alloc(&realnm);
- realnmp = &realnm;
- }
-
-top:
xattr_obj = 0;
xzp = NULL;
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, realnmp)) {
- if (realnmp)
- pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
@@ -1998,14 +1951,15 @@ top:
vnevent_remove(vp, dvp, name, ct);
- if (realnmp)
- dnlc_remove(dvp, realnmp->pn_buf);
- else
- dnlc_remove(dvp, name);
+ obj = zp->z_id;
- VI_LOCK(vp);
- may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
- VI_UNLOCK(vp);
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
/*
* We may delete the znode now, or we may put it in the unlinked set;
@@ -2013,35 +1967,17 @@ top:
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
- obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
- if (may_delete_now) {
- toobig =
- zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
- /* if the file is too big, only hold_free a token amount */
- dmu_tx_hold_free(tx, zp->z_id, 0,
- (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
- }
- /* are there any extended attributes? */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT0(error);
+ if (xzp) {
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- mutex_enter(&zp->z_lock);
- if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- mutex_exit(&zp->z_lock);
-
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -2050,20 +1986,8 @@ top:
*/
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (xzp)
- VN_RELE(ZTOV(xzp));
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- if (realnmp)
- pn_free(realnmp);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
@@ -2072,7 +1996,7 @@ top:
/*
* Remove the directory entry.
*/
- error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
if (error) {
dmu_tx_commit(tx);
@@ -2080,76 +2004,18 @@ top:
}
if (unlinked) {
- /*
- * Hold z_lock so that we can make sure that the ACL obj
- * hasn't changed. Could have been deleted due to
- * zfs_sa_upgrade().
- */
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
- delete_now = may_delete_now && !toobig &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
- acl_obj;
- VI_UNLOCK(vp);
- }
-
- if (delete_now) {
-#ifdef __FreeBSD__
- panic("zfs_remove: delete_now branch taken");
-#endif
- if (xattr_obj_unlinked) {
- ASSERT3U(xzp->z_links, ==, 2);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_links = 0;
- error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &xzp->z_links, sizeof (xzp->z_links), tx);
- ASSERT3U(error, ==, 0);
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
-
- if (zp->z_is_sa)
- error = sa_remove(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), tx);
- else
- error = sa_update(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), &null_xattr,
- sizeof (uint64_t), tx);
- ASSERT0(error);
- }
- VI_LOCK(vp);
- vp->v_count--;
- ASSERT0(vp->v_count);
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- } else if (unlinked) {
- mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
-#ifdef __FreeBSD__
vp->v_vflag |= VV_NOSYNC;
-#endif
}
txtype = TX_REMOVE;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
- if (realnmp)
- pn_free(realnmp);
-
- zfs_dirent_unlock(dl);
- if (!delete_now)
- VN_RELE(vp);
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2180,23 +2046,19 @@ out:
*/
/*ARGSUSED*/
static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
- caller_context_t *ct, int flags, vsecattr_t *vsecp)
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
uint64_t txtype;
dmu_tx_t *tx;
int error;
- int zf = ZNEW;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
- boolean_t waited = B_FALSE;
ASSERT(vap->va_type == VDIR);
@@ -2211,7 +2073,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) ||
+ ((vap->va_mask & AT_XVATTR) ||
IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
@@ -2229,8 +2091,6 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
@@ -2241,13 +2101,11 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
}
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- vsecp, &acl_ids)) != 0) {
+ NULL, &acl_ids)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
- getnewvnode_reserve(1);
-
/*
* First make sure the new directory doesn't exist.
*
@@ -2255,29 +2113,23 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
* EACCES instead of EEXIST which can cause some applications
* to fail.
*/
-top:
*vpp = NULL;
- if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
- NULL, NULL)) {
+ if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
zfs_acl_ids_free(&acl_ids);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
+ ASSERT3P(zp, ==, NULL);
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
@@ -2285,6 +2137,7 @@ top:
/*
* Add a new entry to the directory.
*/
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
@@ -2299,15 +2152,8 @@ top:
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
@@ -2326,14 +2172,12 @@ top:
/*
* Now put new name in parent dir.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
*vpp = ZTOV(zp);
- txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
@@ -2342,8 +2186,6 @@ top:
getnewvnode_drop_reserve();
- zfs_dirent_unlock(dl);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2370,39 +2212,20 @@ top:
*/
/*ARGSUSED*/
static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
znode_t *dzp = VTOZ(dvp);
- znode_t *zp;
- vnode_t *vp;
+ znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-top:
- zp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
@@ -2413,25 +2236,8 @@ top:
goto out;
}
- if (vp == cwd) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
vnevent_rmdir(vp, dvp, name, ct);
- /*
- * Grab a lock on the directory to make sure that noone is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -2439,48 +2245,26 @@ top:
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
-#ifdef FREEBSD_NAMECACHE
cache_purge(dvp);
-#endif
- error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
if (error == 0) {
uint64_t txtype = TX_RMDIR;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
cache_purge(vp);
-#endif
out:
- zfs_dirent_unlock(dl);
-
- VN_RELE(vp);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2705,10 +2489,10 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
goto skip_entry;
if (!zfs_has_access(ezp, cr)) {
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
goto skip_entry;
}
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
}
if (flags & V_RDDIR_ENTFLAGS)
@@ -2905,7 +2689,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
* than to determine whether we were asked the question.
*/
- mutex_enter(&zp->z_lock);
vap->va_type = IFTOVT(zp->z_mode);
vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
@@ -3042,7 +2825,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_TIME_DECODE(&vap->va_ctime, ctime);
ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
- mutex_exit(&zp->z_lock);
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
@@ -3178,7 +2960,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
}
}
-top:
attrzp = NULL;
aclp = NULL;
@@ -3267,7 +3048,6 @@ top:
}
}
- mutex_enter(&zp->z_lock);
oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
@@ -3341,7 +3121,6 @@ top:
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- mutex_exit(&zp->z_lock);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
@@ -3353,8 +3132,6 @@ top:
}
}
- mutex_exit(&zp->z_lock);
-
if (mask & AT_MODE) {
if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
err = secpolicy_setid_setsticky_clear(vp, vap,
@@ -3429,7 +3206,7 @@ top:
if (new_uid != zp->z_uid &&
zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3441,7 +3218,7 @@ top:
if (new_gid != zp->z_gid &&
zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3463,7 +3240,6 @@ top:
if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
goto out;
- mutex_enter(&zp->z_lock);
if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
/*
* Are we upgrading ACL from old V0 format
@@ -3484,7 +3260,6 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
}
- mutex_exit(&zp->z_lock);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
} else {
if ((mask & AT_XVATTR) &&
@@ -3517,10 +3292,8 @@ top:
* updated as a side-effect of calling this function.
*/
-
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, sizeof (zp->z_pflags));
@@ -3528,7 +3301,6 @@ top:
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&attrzp->z_acl_lock);
- mutex_enter(&attrzp->z_lock);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
sizeof (attrzp->z_pflags));
@@ -3662,14 +3434,12 @@ top:
if (mask != 0)
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
- mutex_exit(&zp->z_lock);
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&zp->z_acl_lock);
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&attrzp->z_acl_lock);
- mutex_exit(&attrzp->z_lock);
}
out:
if (err == 0 && attrzp) {
@@ -3679,7 +3449,7 @@ out:
}
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
if (aclp)
zfs_acl_free(aclp);
@@ -3691,8 +3461,6 @@ out:
if (err) {
dmu_tx_abort(tx);
- if (err == ERESTART)
- goto top;
} else {
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
@@ -3706,101 +3474,236 @@ out2:
return (err);
}
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
+ * We acquire all but fdvp locks using non-blocking acquisitions. If we
+ * fail to acquire any lock in the path, we will drop all held locks,
+ * acquire the contended lock in a blocking fashion, and then release it
+ * and restart the rename. This acquire/release step ensures that we do
+ * not spin on a lock waiting for its release. On error, release all
+ * vnode locks and decrement references the way tmpfs_rename() does.
*/
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
{
- zfs_zlock_t *zl;
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK(tdvp, 0);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK(*tvpp, 0);
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- VN_RELE(ZTOV(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next;
- kmem_free(zl, sizeof (*zl));
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK(tdvp, 0);
+ goto relock;
}
-}
+ tdzp = VTOZ(tdvp);
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t oidp = zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+	 * As a porting legacy from illumos we have two things to worry
+	 * about. One is typical for FreeBSD: that the vnode has not been
+	 * reclaimed (doomed). The other is that the znode is live. The
+	 * current code can invalidate the znode without acquiring the
+	 * corresponding vnode lock if the object represented by the znode
+	 * and vnode is no longer valid after a rollback or receive operation.
+	 * z_teardown_lock, hidden behind ZFS_ENTER and ZFS_EXIT, is the lock
+	 * that protects the znodes from this invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
/*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+	 * bypassing the cleanup code, in the case of an error.
*/
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+	 * Re-resolve tvp; if it disappeared, we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+	 * Now try to acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK(nvp, 0);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(*svpp, 0);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
}
+ vput(nvp);
+ goto relock;
}
+ *tvpp = nvp;
+ }
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
+ return (0);
- if (oidp == szp->z_id) /* We're a descendant of szp */
- return (SET_ERROR(EINVAL));
+out:
+ return (error);
+}
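
The relock logic above is a classic try-lock/back-off pattern. A minimal
userspace sketch of the same idiom, with pthread mutexes standing in for
vnode locks (lock_pair is an illustrative name, not part of this change):

#include <pthread.h>

/*
 * Acquire a, then try b without blocking. On contention, drop a,
 * block on b once so we do not spin, release it, and restart;
 * the same shape as the relock: loop in zfs_rename_relock().
 */
static int
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	for (;;) {
		pthread_mutex_lock(a);
		if (pthread_mutex_trylock(b) == 0)
			return (0);		/* both locks held */
		pthread_mutex_unlock(a);
		pthread_mutex_lock(b);		/* wait out the holder */
		pthread_mutex_unlock(b);	/* then retry from scratch */
	}
}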
- if (oidp == rootid) /* We've hit the top */
- return (0);
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if the last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
}
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
- &oidp, sizeof (oidp));
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
- } while (zp->z_id != sdzp->z_id);
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
- return (0);
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
}
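
The walk in zfs_rename_check() is the standard "is the target directory a
descendant of the source?" test, expressed through SA parent pointers. The
core of it, reduced to a toy in-memory tree (struct node and rename_check
are hypothetical names used only for illustration):

#include <errno.h>
#include <stddef.h>

struct node {
	struct node *parent;	/* NULL at the filesystem root */
};

/* Fail if tdir is szp itself or any descendant of szp. */
static int
rename_check(const struct node *szp, const struct node *tdir)
{
	const struct node *n;

	for (n = tdir; n != NULL; n = n->parent)
		if (n == szp)
			return (EINVAL);
	return (0);
}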
/*
@@ -3822,187 +3725,93 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
*/
/*ARGSUSED*/
static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr)
{
- znode_t *tdzp, *sdzp, *szp, *tzp;
- zfsvfs_t *zfsvfs;
- zilog_t *zilog;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr;
+ char *snm = scnp->cn_nameptr;
+ char *tnm = tcnp->cn_nameptr;
int error = 0;
- int zflg = 0;
- boolean_t waited = B_FALSE;
- tdzp = VTOZ(tdvp);
- ZFS_VERIFY_ZP(tdzp);
- zfsvfs = tdzp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- zilog = zfsvfs->z_log;
- sdzp = VTOZ(sdvp);
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
/*
- * In case sdzp is not valid, let's be sure to exit from the right
- * zfsvfs_t.
+ * Lock all four vnodes to ensure safety and semantics of renaming.
*/
- if (sdzp->z_sa_hdl == NULL) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EIO));
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
}
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
/*
- * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
- * ctldir appear to have the same v_vfsp.
+	 * After we re-enter via ZFS_ENTER() we will have to revalidate all
+ * znodes involved.
*/
- if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
+ ZFS_ENTER(zfsvfs);
if (zfsvfs->z_utf8 && u8_validate(tnm,
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
-	 * by renaming a linked file into/out of an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
}
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- /*
- * First compare the two name arguments without
- * considering any case folding.
- */
- int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
- cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
- ASSERT(error == 0 || !zfsvfs->z_utf8);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- /*
- * If the file system is case-folding, then we may
- * have some more checking to do. A case-folding file
- * system is either supporting mixed case sensitivity
- * access or is completely case-insensitive. Note
- * that the file system is always case preserving.
- *
- * In mixed sensitivity mode case sensitive behavior
- * is the default. FIGNORECASE must be used to
- * explicitly request case insensitive behavior.
- *
- * If the source and target names provided differ only
- * by case (e.g., a request to rename 'tim' to 'Tim'),
- * we will treat this as a special case in the
- * case-insensitive mode: as long as the source name
- * is an exact match, we will allow this to proceed as
- * a name-change request.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- flags & FIGNORECASE)) &&
- u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
- &error) == 0) {
- /*
- * case preserving rename request, require exact
- * name matches
- */
- zflg |= ZCIEXACT;
- zflg &= ~ZCILOOK;
- }
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
}
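
For context, the ordering logic removed just above implemented deadlock
avoidance by imposing a total order on the two directory entries: smaller
object id first, entry name as the tie-breaker. A hedged sketch of that
ordering rule (struct dentry and lock_first are illustrative, and strcmp
stands in for the case-aware u8_strcmp comparison):

#include <string.h>

struct dentry {
	unsigned long	dir_id;		/* directory object id */
	const char	*name;		/* entry name within it */
};

/* Nonzero if a must be locked before b under the old total order. */
static int
lock_first(const struct dentry *a, const struct dentry *b)
{
	if (a->dir_id != b->dir_id)
		return (a->dir_id < b->dir_id);
	return (strcmp(a->name, b->name) < 0);
}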
/*
- * If the source and destination directories are the same, we should
- * grab the z_name_lock of that directory only once.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+	 * bypassing the cleanup code, in the case of an error.
*/
- if (sdzp == tdzp) {
- zflg |= ZHAVELOCK;
- rw_enter(&sdzp->z_name_lock, RW_READER);
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
- ZEXISTS | zflg, NULL, NULL);
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
- } else {
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, zflg, NULL, NULL);
- serr = zfs_dirent_lock(&sdl,
- sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
- NULL, NULL);
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- /*
- * FreeBSD: In OpenSolaris they only check if rename source is
- * ".." here, because "." is handled in their lookup. This is
- * not the case for FreeBSD, so we check for "." explicitly.
- */
- if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
- serr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl);
- VN_RELE(ZTOV(szp));
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (strcmp(tnm, "..") == 0)
- terr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (terr);
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
}
/*
@@ -4011,17 +3820,26 @@ top:
* Note that if target and source are the same, this can be
* done in a single check.
*/
-
if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
- if (ZTOV(szp)->v_type == VDIR) {
/*
* Check to make sure rename is valid.
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
*/
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
+ if (error = zfs_rename_check(szp, sdzp, tdzp))
+ goto unlockout;
}
/*
@@ -4031,31 +3849,26 @@ top:
/*
* Source and target must be the same type.
*/
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
error = SET_ERROR(ENOTDIR);
- goto out;
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
}
} else {
- if (ZTOV(tzp)->v_type == VDIR) {
+ if ((*tvpp)->v_type == VDIR) {
error = SET_ERROR(EISDIR);
- goto out;
+ goto unlockout;
}
}
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
}
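
The POSIX rule quoted in the removed comment is easy to observe from
userland: renaming one hard link of a file onto another link to the same
file must succeed while leaving both names untouched. A small
self-contained demonstration (the file names are arbitrary):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	FILE *f = fopen("a", "w");

	if (f == NULL)
		return (1);
	fclose(f);
	if (link("a", "b") != 0)	/* "a" and "b": one object */
		return (1);
	if (rename("a", "b") != 0)	/* POSIX: succeed, do nothing */
		perror("rename");
	/* Both names must still exist afterwards. */
	return (access("a", F_OK) != 0 || access("b", F_OK) != 0);
}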
- vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
if (tzp)
- vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
/*
* notify the target directory if it is not the same
@@ -4081,35 +3894,18 @@ top:
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
+ goto unlockout;
}
+
if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
if (error == 0) {
szp->z_pflags |= ZFS_AV_MODIFIED;
@@ -4117,17 +3913,16 @@ top:
(void *)&szp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0), sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
/*
* Update path information for the target vnode
*/
- vn_renamepath(tdvp, ZTOV(szp), tnm,
- strlen(tnm));
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
} else {
/*
* At this point, we have successfully created
@@ -4141,42 +3936,33 @@ top:
* succeed; fortunately, it is very unlikely to
* fail, since we just created it.
*/
- VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
ZRENAMING, NULL), ==, 0);
}
}
-#ifdef FREEBSD_NAMECACHE
if (error == 0) {
- cache_purge(sdvp);
- cache_purge(tdvp);
- cache_purge(ZTOV(szp));
- if (tzp)
- cache_purge(ZTOV(tzp));
+ cache_purge(*svpp);
+ if (*tvpp != NULL)
+ cache_purge(*tvpp);
+ cache_purge_negative(tdvp);
}
-#endif
}
dmu_tx_commit(tx);
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(*svpp, 0);
+ VOP_UNLOCK(sdvp, 0);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+out: /* original two vnodes are locked */
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
-
+ if (*tvpp != NULL)
+ VOP_UNLOCK(*tvpp, 0);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK(tdvp, 0);
return (error);
}
@@ -4201,17 +3987,14 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
cred_t *cr, kthread_t *td)
{
znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t len = strlen(link);
int error;
- int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
uint64_t txtype = TX_SYMLINK;
- boolean_t waited = B_FALSE;
int flags = 0;
ASSERT(vap->va_type == VLNK);
@@ -4225,8 +4008,6 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
@@ -4239,35 +4020,29 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
return (error);
}
- getnewvnode_reserve(1);
-
-top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
if (error) {
zfs_acl_ids_free(&acl_ids);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
+
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
@@ -4281,15 +4056,8 @@ top:
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
@@ -4306,13 +4074,11 @@ top:
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
link, len, tx);
else
zfs_sa_symlink(zp, link, len, tx);
- mutex_exit(&zp->z_lock);
zp->z_size = len;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -4320,10 +4086,8 @@ top:
/*
* Insert the new object into the directory.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
*vpp = ZTOV(zp);
@@ -4333,8 +4097,6 @@ top:
getnewvnode_drop_reserve();
- zfs_dirent_unlock(dl);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -4369,13 +4131,11 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
SA_ZPL_SYMLINK(zfsvfs), uio);
else
error = zfs_sa_readlink(zp, uio);
- mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -4407,14 +4167,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
znode_t *tzp, *szp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- vnode_t *realvp;
int error;
- int zf = ZNEW;
uint64_t parent;
uid_t owner;
- boolean_t waited = B_FALSE;
ASSERT(tdvp->v_type == VDIR);
@@ -4422,9 +4178,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (VOP_REALVP(svp, &realvp, ct) == 0)
- svp = realvp;
-
/*
* POSIX dictates that we return EPERM here.
* Better choices include ENOTSUP or EISDIR.
@@ -4442,15 +4195,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
return (SET_ERROR(EPERM));
}
- /*
- * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
- * ctldir appear to have the same v_vfsp.
- */
- if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
-
/* Prevent links to .zfs/shares files */
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
@@ -4468,8 +4212,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
/*
* We do not support links between attributes and non-attributes
@@ -4494,11 +4236,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
return (error);
}
-top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -4509,33 +4250,22 @@ top:
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
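
This hunk repeats a conversion applied throughout the file:
dmu_tx_assign(TXG_NOWAIT) with a caller-driven "goto top" retry on
ERESTART becomes dmu_tx_assign(TXG_WAIT), which sleeps internally, so the
error path collapses to a single abort. A runnable toy contrasting the two
shapes (try_assign and wait_assign are stand-ins, not DMU calls):

#include <errno.h>
#include <stdio.h>

static int attempts = 2;	/* pretend the first two tries hit ERESTART */

static int try_assign(void) { return (attempts-- > 0 ? ERESTART : 0); }
static int wait_assign(void) { attempts = 0; return (0); }

int
main(void)
{
	int error;

top:
	error = try_assign();	/* old: caller drops locks and retries */
	if (error == ERESTART)
		goto top;
	printf("nowait style settled with %d\n", error);

	error = wait_assign();	/* new: the call itself waits */
	printf("wait style settled with %d\n", error);
	return (0);
}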
- error = zfs_link_create(dl, szp, tx, 0);
+ error = zfs_link_create(dzp, name, szp, tx, 0);
if (error == 0) {
uint64_t txtype = TX_LINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_link(zilog, tx, txtype, dzp, szp, name);
}
dmu_tx_commit(tx);
- zfs_dirent_unlock(dl);
-
if (error == 0) {
vnevent_link(svp, ct);
}
@@ -4547,235 +4277,6 @@ top:
return (error);
}
-#ifdef illumos
-/*
- * zfs_null_putapage() is used when the file system has been force
- * unmounted. It just drops the pages.
- */
-/* ARGSUSED */
-static int
-zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
- return (0);
-}
-
-/*
- * Push a page out to disk, klustering if possible.
- *
- * IN: vp - file to push page to.
- * pp - page to push.
- * flags - additional flags.
- * cr - credentials of caller.
- *
- * OUT: offp - start of range pushed.
- * lenp - len of range pushed.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * NOTE: callers must have locked the page to be pushed. On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_tx_t *tx;
- u_offset_t off, koff;
- size_t len, klen;
- int err;
-
- off = pp->p_offset;
- len = PAGESIZE;
- /*
- * If our blocksize is bigger than the page size, try to kluster
- * multiple pages so that we write a full block (thus avoiding
- * a read-modify-write).
- */
- if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
- ASSERT(koff <= zp->z_size);
- if (koff + klen > zp->z_size)
- klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
- pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
- }
- ASSERT3U(btop(len), ==, btopr(len));
-
- /*
- * Can't push pages past end-of-file.
- */
- if (off >= zp->z_size) {
- /* ignore all pages */
- err = 0;
- goto out;
- } else if (off + len > zp->z_size) {
- int npages = btopr(zp->z_size - off);
- page_t *trunc;
-
- page_list_break(&pp, &trunc, npages);
- /* ignore pages past end of file */
- if (trunc)
- pvn_write_done(trunc, flags);
- len = zp->z_size - off;
- }
-
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
- err = SET_ERROR(EDQUOT);
- goto out;
- }
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, zp->z_id, off, len);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- goto out;
- }
-
- if (zp->z_blksz <= PAGESIZE) {
- caddr_t va = zfs_map_page(pp, S_READ);
- ASSERT3U(len, <=, PAGESIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
- zfs_unmap_page(pp, va);
- } else {
- err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
- }
-
- if (err == 0) {
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int count = 0;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
- }
- dmu_tx_commit(tx);
-
-out:
- pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
- if (offp)
- *offp = off;
- if (lenp)
- *lenp = len;
-
- return (err);
-}
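
The klustering arithmetic in the removed zfs_putapage() uses the usual
power-of-two helpers. A worked example, assuming the Solaris-style macro
definitions (values chosen purely for illustration):

#include <stdio.h>

#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	unsigned long blksz = 128 * 1024, off = 0x21000;

	/* Round the block size up to whole pages ... */
	unsigned long klen = P2ROUNDUP(blksz, 4096UL);
	/* ... and align the dirty offset down to a block boundary. */
	unsigned long koff = P2ALIGN(off, klen);

	/* Prints klen=0x20000 koff=0x20000: one full block is written. */
	printf("klen=%#lx koff=%#lx\n", klen, koff);
	return (0);
}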
-
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the file's vnode.
- *
- * IN: vp - vnode of file to push page data to.
- * off - position in file to put data.
- * len - amount of data to write.
- * flags - flags to control the operation.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp;
- size_t io_len;
- u_offset_t io_off;
- uint_t blksz;
- rl_t *rl;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * Align this request to the file block size in case we kluster.
-	 * XXX - this can result in pretty aggressive locking, which can
-	 * impact simultaneous read/write access. One option might be
- * to break up long requests (len == 0) into block-by-block
- * operations to get narrower locking.
- */
- blksz = zp->z_blksz;
- if (ISP2(blksz))
- io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
- else
- io_off = 0;
- if (len > 0 && ISP2(blksz))
- io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
- else
- io_len = 0;
-
- if (io_len == 0) {
- /*
- * Search the entire vp list for pages >= io_off.
- */
- rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
- error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
- goto out;
- }
- rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
-
- if (off > zp->z_size) {
- /* past end of file */
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
-
- for (off = io_off; io_off < off + len; io_off += io_len) {
- if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
- pp = page_lookup(vp, io_off,
- (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
- } else {
- pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
- }
-
- if (pp != NULL && pvn_getdirty(pp, flags)) {
- int err;
-
- /*
- * Found a dirty page to push
- */
- err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
- if (err)
- error = err;
- } else {
- io_len = PAGESIZE;
- }
- }
-out:
- zfs_range_unlock(rl);
- if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* illumos */
/*ARGSUSED*/
void
@@ -4796,17 +4297,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
return;
}
- mutex_enter(&zp->z_lock);
if (zp->z_unlinked) {
/*
* Fast path to recycle a vnode of a removed file.
*/
- mutex_exit(&zp->z_lock);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
- mutex_exit(&zp->z_lock);
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
@@ -4817,444 +4315,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
if (error) {
dmu_tx_abort(tx);
} else {
- mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
}
}
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
-#ifdef illumos
-/*
- * Bounds-check the seek operation.
- *
- * IN: vp - vnode seeking within
- * ooff - old file offset
- * noffp - pointer to new file offset
- * ct - caller context
- *
- * RETURN: 0 on success, EINVAL if new offset invalid.
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
- caller_context_t *ct)
-{
- if (vp->v_type == VDIR)
- return (0);
- return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
-
-/*
- * Pre-filter the generic locking function to trap attempts to place
- * a mandatory lock on a memory mapped file.
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
- flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * We are following the UFS semantics with respect to mapcnt
- * here: If we see that the file is mapped already, then we will
- * return an error, but we don't worry about races between this
- * function and zfs_map().
- */
- if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
- ZFS_EXIT(zfsvfs);
- return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
-}
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list. Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
- caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
- znode_t *zp = VTOZ(vp);
- page_t *pp, *cur_pp;
- objset_t *os = zp->z_zfsvfs->z_os;
- u_offset_t io_off, total;
- size_t io_len;
- int err;
-
- if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
- /*
- * We only have a single page, don't bother klustering
- */
- io_off = off;
- io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len,
- PG_EXCL | PG_WAIT, seg, addr);
- } else {
- /*
- * Try to find enough pages to fill the page list
- */
- pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, off, plsz, 0);
- }
- if (pp == NULL) {
- /*
- * The page already exists, nothing to do here.
- */
- *pl = NULL;
- return (0);
- }
-
- /*
- * Fill the pages in the kluster.
- */
- cur_pp = pp;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- caddr_t va;
-
- ASSERT3U(io_off, ==, cur_pp->p_offset);
- va = zfs_map_page(cur_pp, S_WRITE);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
- DMU_READ_PREFETCH);
- zfs_unmap_page(cur_pp, va);
- if (err) {
- /* On error, toss the entire kluster */
- pvn_read_done(pp, B_ERROR);
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
- cur_pp = cur_pp->p_next;
- }
-
- /*
- * Fill in the page list array from the kluster starting
- * from the desired offset `off'.
- * NOTE: the page list will always be null terminated.
- */
- pvn_plist_init(pp, pl, plsz, off, io_len, rw);
- ASSERT(pl == NULL || (*pl)->p_offset == off);
-
- return (0);
-}
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array. If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]). These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- * IN: vp - vnode of file to get data from.
- * off - position in file to get data from.
- * len - amount of data to retrieve.
- * plsz - length of provided page list.
- * seg - segment to obtain pages for.
- * addr - virtual address of fault.
- * rw - mode of created pages.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * OUT: protp - protection mode of created pages.
- * pl - list of pages created.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t **pl0 = pl;
- int err = 0;
-
- /* we do our own caching, faultahead is unnecessary */
- if (pl == NULL)
- return (0);
- else if (len > plsz)
- len = plsz;
- else
- len = P2ROUNDUP(len, PAGESIZE);
- ASSERT(plsz >= len);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (protp)
- *protp = PROT_ALL;
-
- /*
- * Loop through the requested range [off, off + len) looking
- * for pages. If we don't find a page, we will need to create
- * a new page and fill it with data from the file.
- */
- while (len > 0) {
- if (*pl = page_lookup(vp, off, SE_SHARED))
- *(pl+1) = NULL;
- else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
- goto out;
- while (*pl) {
- ASSERT3U((*pl)->p_offset, ==, off);
- off += PAGESIZE;
- addr += PAGESIZE;
- if (len > 0) {
- ASSERT3U(len, >=, PAGESIZE);
- len -= PAGESIZE;
- }
- ASSERT3U(plsz, >=, PAGESIZE);
- plsz -= PAGESIZE;
- pl++;
- }
- }
-
- /*
- * Fill out the page array with any pages already in the cache.
- */
- while (plsz > 0 &&
- (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
- off += PAGESIZE;
- plsz -= PAGESIZE;
- }
-out:
- if (err) {
- /*
- * Release any pages we have previously locked.
- */
- while (pl > pl0)
- page_unlock(*--pl);
- } else {
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- }
-
- *pl = NULL;
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Request a memory map for a section of a file. This code interacts
- * with common code and the VM system as follows:
- *
- * - common code calls mmap(), which ends up in smmap_common()
- * - this calls VOP_MAP(), which takes you into (say) zfs
- * - zfs_map() calls as_map(), passing segvn_create() as the callback
- * - segvn_create() creates the new segment and calls VOP_ADDMAP()
- * - zfs_addmap() updates z_mapcnt
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- segvn_crargs_t vn_a;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((prot & PROT_WRITE) && (zp->z_pflags &
- (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if ((prot & (PROT_READ | PROT_EXEC)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- if (vp->v_flag & VNOMAP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSYS));
- }
-
- if (off < 0 || len > MAXOFFSET_T - off) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENXIO));
- }
-
- if (vp->v_type != VREG) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENODEV));
- }
-
- /*
- * If file is locked, disallow mapping.
- */
- if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
-
- as_rangelock(as);
- error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
- if (error != 0) {
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vn_a.vp = vp;
- vn_a.offset = (u_offset_t)off;
- vn_a.type = flags & MAP_TYPE;
- vn_a.prot = prot;
- vn_a.maxprot = maxprot;
- vn_a.cred = cr;
- vn_a.amp = NULL;
- vn_a.flags = flags & ~MAP_TYPE;
- vn_a.szc = 0;
- vn_a.lgrp_mem_policy_flags = 0;
-
- error = as_map(as, *addrp, len, segvn_create, &vn_a);
-
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
- return (0);
-}
-
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which is by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait for fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
- atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
-
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
- return (0);
-}
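
The heuristic described in the comment above is driven by exactly this
userland sequence: modify a shared writable mapping and tear it down
without msync(). A minimal reproduction (file name and sizes arbitrary):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("f", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return (1);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);
	memcpy(p, "dirty", 5);	/* modify memory; no msync() follows */
	munmap(p, 4096);	/* flushing here keeps mtime honest */
	close(fd);
	return (0);
}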
-
-/*
- * Free or allocate space in a file. Currently, this function only
- * supports the `F_FREESP' command. However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- * IN: vp - vnode of file to free data in.
- * cmd - action to take (only F_FREESP supported).
- * bfp - section of file to free/alloc.
- * flag - current file open mode flags.
- * offset - current file offset.
- * cr - credentials of caller [UNUSED].
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
- offset_t offset, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t off, len;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (cmd != F_FREESP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * In the case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
- * callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- if (error = convoff(vp, bfp, 0, offset)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (bfp->l_len < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- off = bfp->l_start;
- len = bfp->l_len; /* 0 means from off to end of file */
-
- error = zfs_freesp(zp, off, len, flag, TRUE);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* illumos */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
@@ -5331,7 +4400,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
{
znode_t *zp, *xzp;
zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
int error;
switch (cmd) {
@@ -5349,13 +4417,12 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
*valp = 0;
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
if (error == 0) {
- zfs_dirent_unlock(dl);
if (!zfs_dirempty(xzp))
*valp = 1;
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
} else if (error == ENOENT) {
/*
* If there aren't extended attributes, it's the
@@ -5448,339 +4515,6 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
return (error);
}
-#ifdef illumos
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10); /* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read. This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17); /* 128K */
-
-/*ARGSUSED*/
-static int
-zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int max_blksz = zfsvfs->z_max_blksz;
- uio_t *uio = &xuio->xu_uio;
- ssize_t size = uio->uio_resid;
- offset_t offset = uio->uio_loffset;
- int blksz;
- int fullblk, i;
- arc_buf_t *abuf;
- ssize_t maxsize;
- int preamble, postamble;
-
- if (xuio->xu_type != UIOTYPE_ZEROCOPY)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- switch (ioflag) {
- case UIO_WRITE:
- /*
- * Loan out an arc_buf for write if write size is bigger than
- * max_blksz, and the file's block size is also max_blksz.
- */
- blksz = max_blksz;
- if (size < blksz || zp->z_blksz != blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- /*
- * Caller requests buffers for write before knowing where the
- * write offset might be (e.g. NFS TCP write).
- */
- if (offset == -1) {
- preamble = 0;
- } else {
- preamble = P2PHASE(offset, blksz);
- if (preamble) {
- preamble = blksz - preamble;
- size -= preamble;
- }
- }
-
- postamble = P2PHASE(size, blksz);
- size -= postamble;
-
- fullblk = size / blksz;
- (void) dmu_xuio_init(xuio,
- (preamble != 0) + fullblk + (postamble != 0));
- DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
- int, postamble, int,
- (preamble != 0) + fullblk + (postamble != 0));
-
- /*
- * Have to fix iov base/len for partial buffers. They
- * currently represent full arc_buf's.
- */
- if (preamble) {
- /* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf,
- blksz - preamble, preamble);
- }
-
- for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, blksz);
- }
-
- if (postamble) {
- /* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, postamble);
- }
- break;
- case UIO_READ:
- /*
- * Loan out an arc_buf for read if the read size is larger than
- * the current file block size. Block alignment is not
- * considered. Partial arc_buf will be loaned out for read.
- */
- blksz = zp->z_blksz;
- if (blksz < zcr_blksz_min)
- blksz = zcr_blksz_min;
- if (blksz > zcr_blksz_max)
- blksz = zcr_blksz_max;
- /* avoid potential complexity of dealing with it */
- if (blksz > max_blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- maxsize = zp->z_size - uio->uio_loffset;
- if (size > maxsize)
- size = maxsize;
-
- if (size < blksz || vn_has_cached_data(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- break;
- default:
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- uio->uio_extflg = UIO_XUIO;
- XUIO_XUZC_RW(xuio) = ioflag;
- ZFS_EXIT(zfsvfs);
- return (0);
-}
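
The preamble/postamble split in the removed UIO_WRITE case is plain
modular arithmetic. A worked example assuming blksz = 128K, with P2PHASE
defined Solaris-style:

#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))

int
main(void)
{
	long blksz = 128 * 1024;
	long offset = 100 * 1024, size = 300 * 1024;

	long preamble = P2PHASE(offset, blksz);	/* distance past boundary */
	if (preamble)
		preamble = blksz - preamble;	/* ... up to the next one */
	size -= preamble;
	long postamble = P2PHASE(size, blksz);	/* tail past last boundary */
	size -= postamble;

	/* Prints pre=28K full=2 post=16K for these inputs. */
	printf("pre=%ldK full=%ld post=%ldK\n",
	    preamble / 1024, size / blksz, postamble / 1024);
	return (0);
}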
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
-{
- int i;
- arc_buf_t *abuf;
- int ioflag = XUIO_XUZC_RW(xuio);
-
- ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
- i = dmu_xuio_cnt(xuio);
- while (i-- > 0) {
- abuf = dmu_xuio_arcbuf(xuio, i);
- /*
- * if abuf == NULL, it must be a write buffer
- * that has been returned in zfs_write().
- */
- if (abuf)
- dmu_return_arcbuf(abuf);
- ASSERT(abuf || ioflag == UIO_WRITE);
- }
-
- dmu_xuio_fini(xuio);
- return (0);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-
-static int
-zfs_inval()
-{
- return (SET_ERROR(EINVAL));
-}
-
-static int
-zfs_isdir()
-{
- return (SET_ERROR(EISDIR));
-}
-/*
- * Directory vnode operations template
- */
-vnodeops_t *zfs_dvnodeops;
-const fs_operation_def_t zfs_dvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .error = zfs_isdir },
- VOPNAME_WRITE, { .error = zfs_isdir },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Regular file vnode operations template
- */
-vnodeops_t *zfs_fvnodeops;
-const fs_operation_def_t zfs_fvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .vop_read = zfs_read },
- VOPNAME_WRITE, { .vop_write = zfs_write },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
- VOPNAME_SPACE, { .vop_space = zfs_space },
- VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
- VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
- VOPNAME_MAP, { .vop_map = zfs_map },
- VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
- VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
- VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
- NULL, NULL
-};
-
-/*
- * Symbolic link vnode operations template
- */
-vnodeops_t *zfs_symvnodeops;
-const fs_operation_def_t zfs_symvnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * special share hidden files vnode operations template
- */
-vnodeops_t *zfs_sharevnodeops;
-const fs_operation_def_t zfs_sharevnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Extended attribute directory vnode operations template
- *
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- * VOP_MKDIR()
- * VOP_SYMLINK()
- *
- * Note that there are other restrictions embedded in:
- * zfs_create() - restrict type to VREG
- * zfs_link() - no links into/out of attribute space
- * zfs_rename() - no moves into/out of attribute space
- */
-vnodeops_t *zfs_xdvnodeops;
-const fs_operation_def_t zfs_xdvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .error = zfs_inval },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .error = zfs_inval },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Error vnode operations template
- */
-vnodeops_t *zfs_evnodeops;
-const fs_operation_def_t zfs_evnodeops_template[] = {
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- NULL, NULL
-};
-#endif /* illumos */
-
static int
ioflags(int ioflags)
{
@@ -5789,7 +4523,7 @@ ioflags(int ioflags)
if (ioflags & IO_APPEND)
flags |= FAPPEND;
if (ioflags & IO_NDELAY)
- flags |= FNONBLOCK;
+ flags |= FNONBLOCK;
if (ioflags & IO_SYNC)
flags |= (FSYNC | FDSYNC | FRSYNC);
@@ -6257,6 +4991,23 @@ zfs_freebsd_lookup(ap)
}
static int
+zfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap));
+}
+
+static int
zfs_freebsd_create(ap)
struct vop_create_args /* {
struct vnode *a_dvp;
@@ -6265,6 +5016,7 @@ zfs_freebsd_create(ap)
struct vattr *a_vap;
} */ *ap;
{
+ zfsvfs_t *zfsvfs;
struct componentname *cnp = ap->a_cnp;
vattr_t *vap = ap->a_vap;
int error, mode;
@@ -6273,13 +5025,13 @@ zfs_freebsd_create(ap)
vattr_init_mask(vap);
mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
-#ifdef FREEBSD_NAMECACHE
- if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
-#endif
return (error);
}
@@ -6294,8 +5046,8 @@ zfs_freebsd_remove(ap)
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
- return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred, NULL, 0));
+ return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
}
static int
@@ -6314,7 +5066,7 @@ zfs_freebsd_mkdir(ap)
vattr_init_mask(vap);
return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred, NULL, 0, NULL));
+ ap->a_cnp->cn_cred));
}
static int
@@ -6329,7 +5081,7 @@ zfs_freebsd_rmdir(ap)
ASSERT(cnp->cn_flags & SAVENAME);
- return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+ return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
static int
@@ -6563,23 +5315,14 @@ zfs_freebsd_rename(ap)
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
- /*
- * Check for cross-device rename.
- */
- if ((fdvp->v_mount != tdvp->v_mount) ||
- (tvp && (fdvp->v_mount != tvp->v_mount)))
- error = EXDEV;
- else
- error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
- ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
- if (tdvp == tvp)
- VN_RELE(tdvp);
- else
- VN_URELE(tdvp);
- if (tvp)
- VN_URELE(tvp);
- VN_RELE(fdvp);
- VN_RELE(fvp);
+ error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
return (error);
}
@@ -7250,6 +5993,39 @@ zfs_vptocnp(struct vop_vptocnp_args *ap)
return (error);
}
+#ifdef DIAGNOSTIC
+static int
+zfs_lock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp;
+ vnode_t *vp;
+ int flags;
+ int err;
+
+ vp = ap->a_vp;
+ flags = ap->a_flags;
+ if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 &&
+ (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
+ zfsvfs = zp->z_zfsvfs;
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+ }
+ err = vop_stdlock(ap);
+ if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 &&
+ (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
+ zfsvfs = zp->z_zfsvfs;
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+ }
+ return (err);
+}
+#endif
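
The DIAGNOSTIC hook above turns a lock-order inversion (blocking on a
vnode lock while z_teardown_lock is held) into an immediate assertion
failure in debug kernels. The generic shape of such a check, sketched with
a thread-local flag standing in for RRM_LOCK_HELD() (checked_lock and
teardown_held are illustrative names):

#include <assert.h>
#include <pthread.h>

static _Thread_local int teardown_held;	/* set while "teardown" is held */

/* Blocking acquisitions must not happen with the teardown lock held. */
static void
checked_lock(pthread_mutex_t *m, int nowait)
{
	if (nowait) {
		(void) pthread_mutex_trylock(m);
		return;
	}
	assert(teardown_held == 0);	/* would invert the lock order */
	pthread_mutex_lock(m);
}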
+
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
@@ -7259,12 +6035,8 @@ struct vop_vector zfs_vnodeops = {
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_access = zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
- .vop_lookup = vfs_cache_lookup,
+ .vop_lookup = zfs_cache_lookup,
.vop_cachedlookup = zfs_freebsd_lookup,
-#else
- .vop_lookup = zfs_freebsd_lookup,
-#endif
.vop_getattr = zfs_freebsd_getattr,
.vop_setattr = zfs_freebsd_setattr,
.vop_create = zfs_freebsd_create,
@@ -7296,6 +6068,9 @@ struct vop_vector zfs_vnodeops = {
.vop_getpages = zfs_freebsd_getpages,
.vop_putpages = zfs_freebsd_putpages,
.vop_vptocnp = zfs_vptocnp,
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#endif
};
struct vop_vector zfs_fifoops = {
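The DIAGNOSTIC-only zfs_lock() above bolts a lock-order check onto the vnode lock path: both before and after the real acquisition it verifies that the current thread does not hold z_teardown_lock, so a LOR between the vnode lock and the teardown lock is caught on debug kernels rather than in a rare deadlock. A minimal sketch of the pattern, with illustrative names (my_lock and teardown_lock are not part of the patch):

/*
 * Sketch: assert ordering against a higher-level lock around the
 * stock vnode locking routine.
 */
static int
my_lock(struct vop_lock1_args *ap)
{
	int error;

	VERIFY(!RRM_LOCK_HELD(&teardown_lock));	/* not held on entry */
	error = vop_stdlock(ap);		/* the real locking work */
	VERIFY(!RRM_LOCK_HELD(&teardown_lock));	/* still not held */
	return (error);
}

The real zfs_lock() additionally filters on LK_INTERLOCK, LK_NOWAIT and VI_DOOMED before checking, as shown in the hunk above.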
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index 3853838..c947e54 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -124,16 +124,12 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
list_link_init(&zp->z_link_node);
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
zp->z_vnode = NULL;
zp->z_moved = 0;
@@ -150,14 +146,10 @@ zfs_znode_cache_destructor(void *buf, void *arg)
ASSERT(ZTOV(zp) == NULL);
vn_free(ZTOV(zp));
ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -559,8 +551,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
- mutex_enter(&zp->z_lock);
-
ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
if (sa_hdl == NULL) {
@@ -580,7 +570,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
ZTOV(zp)->v_flag |= VROOT;
- mutex_exit(&zp->z_lock);
vn_exists(ZTOV(zp));
}
@@ -637,7 +626,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_vnode = vp;
vp->v_data = zp;
- ASSERT(zp->z_dirlocks == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
zp->z_moved = 0;
@@ -739,7 +727,14 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
/*
* Acquire vnode lock before making it available to the world.
*/
+#ifdef DIAGNOSTIC
+ vop_lock1_t *orig_lock = vp->v_op->vop_lock1;
+ vp->v_op->vop_lock1 = vop_stdlock;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vp->v_op->vop_lock1 = orig_lock;
+#else
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#endif
VN_LOCK_AREC(vp);
if (vp->v_type != VFIFO)
VN_LOCK_ASHARE(vp);
@@ -1161,54 +1156,55 @@ again:
if (hdl != NULL) {
zp = sa_get_userdata(hdl);
-
/*
* Since "SA" does immediate eviction we
* should never find a sa handle that doesn't
* know about the znode.
*/
-
ASSERT3P(zp, !=, NULL);
-
- mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- err = SET_ERROR(ENOENT);
- } else {
- vp = ZTOV(zp);
- *zpp = zp;
- err = 0;
- }
+ *zpp = zp;
+ vp = ZTOV(zp);
/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
- if (err == 0)
- VN_HOLD(vp);
+ VN_HOLD(vp);
- mutex_exit(&zp->z_lock);
sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- if (err == 0) {
- locked = VOP_ISLOCKED(vp);
- VI_LOCK(vp);
- if ((vp->v_iflag & VI_DOOMED) != 0 &&
- locked != LK_EXCLUSIVE) {
- /*
- * The vnode is doomed and this thread doesn't
- * hold the exclusive lock on it, so the vnode
- * must be being reclaimed by another thread.
- * Otherwise the doomed vnode is being reclaimed
- * by this thread and zfs_zget is called from
- * ZIL internals.
- */
- VI_UNLOCK(vp);
- VN_RELE(vp);
- goto again;
- }
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0 &&
+ locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread performs
+			 * a ZFS vnode operation on the vnode. That either
+			 * should not happen if the vnode is dead or the thread
+			 * should also have a reference to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
}
+ VI_UNLOCK(vp);
getnewvnode_drop_reserve();
- return (err);
+ return (0);
}
/*
@@ -1391,20 +1387,16 @@ zfs_zinactive(znode_t *zp)
*/
ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
- mutex_enter(&zp->z_lock);
-
/*
* If this was the last reference to a file with no links,
* remove the file from the file system.
*/
if (zp->z_unlinked) {
- mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_rmnode(zp);
return;
}
- mutex_exit(&zp->z_lock);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_znode_free(zp);
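The rewritten zfs_zget() above drops the per-znode z_lock and instead leans on a retry loop: if the vnode it found is doomed and this thread does not hold it exclusively, another thread is reclaiming it, so the reference is dropped and the lookup restarts. A hedged sketch of that retry shape (lookup_vnode() is a hypothetical stand-in for the SA-handle lookup):

/*
 * Sketch of the doomed-vnode retry pattern; not the literal code.
 */
for (;;) {
	vp = lookup_vnode(obj_num);	/* hypothetical helper */
	locked = VOP_ISLOCKED(vp);
	VI_LOCK(vp);
	if ((vp->v_iflag & VI_DOOMED) != 0 && locked != LK_EXCLUSIVE) {
		/* Reclaim in progress elsewhere; drop and retry. */
		VI_UNLOCK(vp);
		vrele(vp);
		continue;
	}
	VI_UNLOCK(vp);
	break;				/* stable, referenced vnode */
}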
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index e6b1e90..0da005a 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -290,7 +290,10 @@ dev/lindev/full.c optional lindev
dev/lindev/lindev.c optional lindev
dev/nfe/if_nfe.c optional nfe pci
dev/ntb/if_ntb/if_ntb.c optional if_ntb
-dev/ntb/ntb_hw/ntb_hw.c optional if_ntb ntb_hw
+dev/ntb/ntb_transport.c optional if_ntb
+dev/ntb/ntb.c optional if_ntb | ntb_hw
+dev/ntb/ntb_if.m optional if_ntb | ntb_hw
+dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw
dev/nvd/nvd.c optional nvd nvme
dev/nve/if_nve.c optional nve pci
dev/nvme/nvme.c optional nvme
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index fe93e68..e46fe53 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -286,7 +286,10 @@ dev/mse/mse.c optional mse
dev/mse/mse_isa.c optional mse isa
dev/nfe/if_nfe.c optional nfe pci
dev/ntb/if_ntb/if_ntb.c optional if_ntb
-dev/ntb/ntb_hw/ntb_hw.c optional if_ntb | ntb_hw
+dev/ntb/ntb_transport.c optional if_ntb
+dev/ntb/ntb.c optional if_ntb | ntb_hw
+dev/ntb/ntb_if.m optional if_ntb | ntb_hw
+dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw
dev/nvd/nvd.c optional nvd nvme
dev/nve/if_nve.c optional nve pci
dev/nvme/nvme.c optional nvme
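The files.amd64 and files.i386 changes split the NTB sources across the two kernel options: ntb_hw now builds only the hardware driver, if_ntb additionally pulls in the transport and the virtual Ethernet layer, and the new ntb.c/ntb_if.m core is built for either. A sketch of kernel config lines that exercise the split (standard config(5) syntax, not part of the patch):

device		ntb_hw		# core + hardware driver (ntb.c, ntb_if.m, ntb_hw.c)
device		if_ntb		# adds ntb_transport.c and if_ntb.c on top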
diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c
index 9db1c44..1295de7 100644
--- a/sys/dev/ahci/ahci.c
+++ b/sys/dev/ahci/ahci.c
@@ -373,7 +373,8 @@ ahci_setup_interrupt(device_t dev)
else if (ctlr->numirqs == 1 || i >= ctlr->channels ||
(ctlr->ccc && i == ctlr->cccv))
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ALL;
- else if (i == ctlr->numirqs - 1)
+ else if (ctlr->channels > ctlr->numirqs &&
+ i == ctlr->numirqs - 1)
ctlr->irqs[i].mode = AHCI_IRQ_MODE_AFTER;
else
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ONE;
@@ -422,6 +423,7 @@ ahci_intr(void *data)
} else { /* AHCI_IRQ_MODE_AFTER */
unit = irq->r_irq_rid - 1;
is = ATA_INL(ctlr->r_mem, AHCI_IS);
+ is &= (0xffffffff << unit);
}
/* CCC interrupt is edge triggered. */
if (ctlr->ccc)
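The added masking matters in AHCI_IRQ_MODE_AFTER: the last MSI vector services its own channel and every higher-numbered one, so interrupt-status bits belonging to lower channels, which have dedicated vectors, must be cleared before dispatch. A worked example with illustrative values:

/*
 * Illustrative only: 4 MSI vectors for 6 channels, so vector 3 runs
 * in AFTER mode and owns channels 3..5.
 */
uint32_t is = 0x0000002d;	/* IS pending on channels 0, 2, 3, 5 */
int unit = 3;			/* this vector's first channel */

is &= (0xffffffffU << unit);	/* -> 0x00000028: only channels 3, 5 */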
diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c
index 22f28e2..bb14ed6 100644
--- a/sys/dev/ahci/ahci_pci.c
+++ b/sys/dev/ahci/ahci_pci.c
@@ -187,7 +187,7 @@ static const struct {
{0xa10f8086, 0x00, "Intel Sunrise Point (RAID)", 0},
{0x23238086, 0x00, "Intel DH89xxCC", 0},
{0x2360197b, 0x00, "JMicron JMB360", 0},
- {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE},
+ {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE | AHCI_Q_1CH},
{0x2362197b, 0x00, "JMicron JMB362", 0},
{0x2363197b, 0x00, "JMicron JMB363", AHCI_Q_NOFORCE},
{0x2365197b, 0x00, "JMicron JMB365", AHCI_Q_NOFORCE},
diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c
index 28379cc..52e2609 100644
--- a/sys/dev/e1000/e1000_api.c
+++ b/sys/dev/e1000/e1000_api.c
@@ -304,6 +304,10 @@ s32 e1000_set_mac_type(struct e1000_hw *hw)
case E1000_DEV_ID_PCH_SPT_I219_LM2:
case E1000_DEV_ID_PCH_SPT_I219_V2:
case E1000_DEV_ID_PCH_LBG_I219_LM3:
+ case E1000_DEV_ID_PCH_SPT_I219_LM4:
+ case E1000_DEV_ID_PCH_SPT_I219_V4:
+ case E1000_DEV_ID_PCH_SPT_I219_LM5:
+ case E1000_DEV_ID_PCH_SPT_I219_V5:
mac->type = e1000_pch_spt;
break;
case E1000_DEV_ID_82575EB_COPPER:
diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h
index 1792e14..e1464a7 100644
--- a/sys/dev/e1000/e1000_hw.h
+++ b/sys/dev/e1000/e1000_hw.h
@@ -142,6 +142,10 @@ struct e1000_hw;
#define E1000_DEV_ID_PCH_SPT_I219_LM2 0x15B7 /* Sunrise Point-H PCH */
#define E1000_DEV_ID_PCH_SPT_I219_V2 0x15B8 /* Sunrise Point-H PCH */
#define E1000_DEV_ID_PCH_LBG_I219_LM3 0x15B9 /* LEWISBURG PCH */
+#define E1000_DEV_ID_PCH_SPT_I219_LM4 0x15D7
+#define E1000_DEV_ID_PCH_SPT_I219_V4 0x15D8
+#define E1000_DEV_ID_PCH_SPT_I219_LM5 0x15E3
+#define E1000_DEV_ID_PCH_SPT_I219_V5 0x15D6
#define E1000_DEV_ID_82576 0x10C9
#define E1000_DEV_ID_82576_FIBER 0x10E6
#define E1000_DEV_ID_82576_SERDES 0x10E7
@@ -957,9 +961,13 @@ struct e1000_dev_spec_ich8lan {
E1000_MUTEX nvm_mutex;
E1000_MUTEX swflag_mutex;
bool nvm_k1_enabled;
+ bool disable_k1_off;
bool eee_disable;
u16 eee_lp_ability;
enum e1000_ulp_state ulp_state;
+ bool ulp_capability_disabled;
+ bool during_suspend_flow;
+ bool during_dpg_exit;
};
struct e1000_dev_spec_82575 {
diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c
index 9b9a090..4c93662 100644
--- a/sys/dev/e1000/e1000_ich8lan.c
+++ b/sys/dev/e1000/e1000_ich8lan.c
@@ -288,7 +288,7 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw)
mac_reg &= ~E1000_CTRL_LANPHYPC_VALUE;
E1000_WRITE_REG(hw, E1000_CTRL, mac_reg);
E1000_WRITE_FLUSH(hw);
- usec_delay(10);
+ msec_delay(1);
mac_reg &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
E1000_WRITE_REG(hw, E1000_CTRL, mac_reg);
E1000_WRITE_FLUSH(hw);
@@ -1625,7 +1625,17 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
hw->phy.ops.write_reg_locked(hw,
I217_PLL_CLOCK_GATE_REG,
phy_reg);
- }
+
+ if (speed == SPEED_1000) {
+ hw->phy.ops.read_reg_locked(hw, HV_PM_CTRL,
+ &phy_reg);
+
+ phy_reg |= HV_PM_CTRL_K1_CLK_REQ;
+
+ hw->phy.ops.write_reg_locked(hw, HV_PM_CTRL,
+ phy_reg);
+ }
+ }
hw->phy.ops.release(hw);
if (ret_val)
@@ -1718,7 +1728,8 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
u32 pcieanacfg = E1000_READ_REG(hw, E1000_PCIEANACFG);
u32 fextnvm6 = E1000_READ_REG(hw, E1000_FEXTNVM6);
- if (pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE)
+ if ((pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE) &&
+ (hw->dev_spec.ich8lan.disable_k1_off == FALSE))
fextnvm6 |= E1000_FEXTNVM6_K1_OFF_ENABLE;
else
fextnvm6 &= ~E1000_FEXTNVM6_K1_OFF_ENABLE;
diff --git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h
index edc1dd1..6d81291 100644
--- a/sys/dev/e1000/e1000_ich8lan.h
+++ b/sys/dev/e1000/e1000_ich8lan.h
@@ -239,7 +239,7 @@
/* PHY Power Management Control */
#define HV_PM_CTRL PHY_REG(770, 17)
-#define HV_PM_CTRL_PLL_STOP_IN_K1_GIGA 0x100
+#define HV_PM_CTRL_K1_CLK_REQ 0x200
#define HV_PM_CTRL_K1_ENABLE 0x4000
#define I217_PLL_CLOCK_GATE_REG PHY_REG(772, 28)
diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c
index b2bec3e..9684b43 100644
--- a/sys/dev/e1000/e1000_phy.c
+++ b/sys/dev/e1000/e1000_phy.c
@@ -4148,10 +4148,10 @@ s32 e1000_read_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 *data)
/* Disable access to mPHY if it was originally disabled */
if (locked)
ready = e1000_is_mphy_ready(hw);
- if (!ready)
- return -E1000_ERR_PHY;
- E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
- E1000_MPHY_DIS_ACCESS);
+ if (!ready)
+ return -E1000_ERR_PHY;
+ E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
+ E1000_MPHY_DIS_ACCESS);
return E1000_SUCCESS;
}
@@ -4213,10 +4213,10 @@ s32 e1000_write_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 data,
/* Disable access to mPHY if it was originally disabled */
if (locked)
ready = e1000_is_mphy_ready(hw);
- if (!ready)
- return -E1000_ERR_PHY;
- E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
- E1000_MPHY_DIS_ACCESS);
+ if (!ready)
+ return -E1000_ERR_PHY;
+ E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
+ E1000_MPHY_DIS_ACCESS);
return E1000_SUCCESS;
}
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 46f3e48..6b6b791 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -192,6 +192,12 @@ static em_vendor_info_t em_vendor_info_array[] =
{ 0x8086, E1000_DEV_ID_PCH_SPT_I219_V2, PCI_ANY_ID, PCI_ANY_ID, 0},
{ 0x8086, E1000_DEV_ID_PCH_LBG_I219_LM3,
PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM4,
+ PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V4, PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM5,
+ PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V5, PCI_ANY_ID, PCI_ANY_ID, 0},
/* required last entry */
{ 0, 0, 0, 0, 0}
};
diff --git a/sys/dev/filemon/filemon.c b/sys/dev/filemon/filemon.c
index 919af9d..26e1bc3 100644
--- a/sys/dev/filemon/filemon.c
+++ b/sys/dev/filemon/filemon.c
@@ -137,6 +137,8 @@ filemon_proc_get(struct proc *p)
{
struct filemon *filemon;
+ if (p->p_filemon == NULL)
+ return (NULL);
PROC_LOCK(p);
filemon = filemon_acquire(p->p_filemon);
PROC_UNLOCK(p);
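The filemon hunk adds an unlocked fast path: p_filemon is read without the process lock, and only a non-NULL hit pays for PROC_LOCK, under which filemon_acquire() re-reads the pointer safely. The resulting shape, restated with editorial comments:

/*
 * Unlocked pre-check: in the common case no filemon is attached, so
 * the lock is skipped entirely.  A racy non-NULL read is harmless
 * because p->p_filemon is re-read under PROC_LOCK.
 */
if (p->p_filemon == NULL)
	return (NULL);
PROC_LOCK(p);
filemon = filemon_acquire(p->p_filemon);
PROC_UNLOCK(p);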
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
index 936e4e1..18626cb 100644
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -810,6 +810,7 @@ hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
* because the fields will be used later in storvsc_io_done().
*/
request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status;
+ request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status;
request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len;
if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) &&
@@ -1945,28 +1946,6 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
return(0);
}
-/*
- * SCSI Inquiry checks qualifier and type.
- * If qualifier is 011b, means the device server is not capable
- * of supporting a peripheral device on this logical unit, and
- * the type should be set to 1Fh.
- *
- * Return 1 if it is valid, 0 otherwise.
- */
-static inline int
-is_inquiry_valid(const struct scsi_inquiry_data *inq_data)
-{
- uint8_t type;
- if (SID_QUAL(inq_data) != SID_QUAL_LU_CONNECTED) {
- return (0);
- }
- type = SID_TYPE(inq_data);
- if (type == T_NODEVICE) {
- return (0);
- }
- return (1);
-}
-
/**
* @brief completion function before returning to CAM
*
@@ -1985,7 +1964,6 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
bus_dma_segment_t *ori_sglist = NULL;
int ori_sg_count = 0;
-
/* destroy bounce buffer if it is used */
if (reqp->bounce_sgl_count) {
ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
@@ -2040,88 +2018,71 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
if (vm_srb->scsi_status == SCSI_STATUS_OK) {
const struct scsi_generic *cmd;
- /*
- * Check whether the data for INQUIRY cmd is valid or
- * not. Windows 10 and Windows 2016 send all zero
- * inquiry data to VM even for unpopulated slots.
- */
+
+ if (vm_srb->srb_status != SRB_STATUS_SUCCESS) {
+ if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) {
+ xpt_print(ccb->ccb_h.path, "invalid LUN %d\n",
+ vm_srb->lun);
+ } else {
+ xpt_print(ccb->ccb_h.path, "Unknown SRB flag: %d\n",
+ vm_srb->srb_status);
+ }
+ /*
+			 * If there are errors, for example an invalid LUN,
+			 * the host informs the VM through the SRB status.
+ */
+ ccb->ccb_h.status |= CAM_SEL_TIMEOUT;
+ } else {
+ ccb->ccb_h.status |= CAM_REQ_CMP;
+ }
+
cmd = (const struct scsi_generic *)
((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes);
if (cmd->opcode == INQUIRY) {
- /*
- * The host of Windows 10 or 2016 server will response
- * the inquiry request with invalid data for unexisted device:
- [0x7f 0x0 0x5 0x2 0x1f ... ]
- * But on windows 2012 R2, the response is:
- [0x7f 0x0 0x0 0x0 0x0 ]
- * That is why here wants to validate the inquiry response.
- * The validation will skip the INQUIRY whose response is short,
- * which is less than SHORT_INQUIRY_LENGTH (36).
- *
- * For more information about INQUIRY, please refer to:
- * ftp://ftp.avc-pioneer.com/Mtfuji_7/Proposal/Jun09/INQUIRY.pdf
- */
- struct scsi_inquiry_data *inq_data =
- (struct scsi_inquiry_data *)csio->data_ptr;
- uint8_t* resp_buf = (uint8_t*)csio->data_ptr;
- /* Get the buffer length reported by host */
- int resp_xfer_len = vm_srb->transfer_len;
- /* Get the available buffer length */
- int resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
- int data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len;
- if (data_len < SHORT_INQUIRY_LENGTH) {
- ccb->ccb_h.status |= CAM_REQ_CMP;
- if (bootverbose && data_len >= 5) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc skips the validation for short inquiry (%d)"
- " [%x %x %x %x %x]\n",
- data_len,resp_buf[0],resp_buf[1],resp_buf[2],
- resp_buf[3],resp_buf[4]);
- mtx_unlock(&sc->hs_lock);
- }
- } else if (is_inquiry_valid(inq_data) == 0) {
- ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
+ struct scsi_inquiry_data *inq_data =
+ (struct scsi_inquiry_data *)csio->data_ptr;
+ uint8_t *resp_buf = (uint8_t *)csio->data_ptr;
+ int resp_xfer_len, resp_buf_len, data_len;
+
+ /* Get the buffer length reported by host */
+ resp_xfer_len = vm_srb->transfer_len;
+ /* Get the available buffer length */
+ resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
+ data_len = (resp_buf_len < resp_xfer_len) ?
+ resp_buf_len : resp_xfer_len;
+
if (bootverbose && data_len >= 5) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc uninstalled invalid device"
- " [%x %x %x %x %x]\n",
- resp_buf[0],resp_buf[1],resp_buf[2],resp_buf[3],resp_buf[4]);
- mtx_unlock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path, "storvsc inquiry "
+ "(%d) [%x %x %x %x %x ... ]\n", data_len,
+ resp_buf[0], resp_buf[1], resp_buf[2],
+ resp_buf[3], resp_buf[4]);
}
- } else {
- char vendor[16];
- cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor),
- sizeof(vendor));
- /**
- * XXX: upgrade SPC2 to SPC3 if host is WIN8 or WIN2012 R2
- * in order to support UNMAP feature
- */
- if (!strncmp(vendor,"Msft",4) &&
- SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
- (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
- vmstor_proto_version== VMSTOR_PROTOCOL_VERSION_WIN8)) {
- inq_data->version = SCSI_REV_SPC3;
- if (bootverbose) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc upgrades SPC2 to SPC3\n");
- mtx_unlock(&sc->hs_lock);
+ if (vm_srb->srb_status == SRB_STATUS_SUCCESS &&
+ data_len > SHORT_INQUIRY_LENGTH) {
+ char vendor[16];
+
+ cam_strvis(vendor, inq_data->vendor,
+ sizeof(inq_data->vendor), sizeof(vendor));
+
+ /*
+ * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or
+ * WIN2012 R2 in order to support UNMAP feature.
+ */
+ if (!strncmp(vendor, "Msft", 4) &&
+ SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
+ (vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+ vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8)) {
+ inq_data->version = SCSI_REV_SPC3;
+ if (bootverbose) {
+ xpt_print(ccb->ccb_h.path,
+ "storvsc upgrades "
+ "SPC2 to SPC3\n");
+ }
}
}
- ccb->ccb_h.status |= CAM_REQ_CMP;
- if (bootverbose) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc has passed inquiry response (%d) validation\n",
- data_len);
- mtx_unlock(&sc->hs_lock);
- }
- }
- } else {
- ccb->ccb_h.status |= CAM_REQ_CMP;
}
} else {
mtx_lock(&sc->hs_lock);
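The INQUIRY handling above derives the usable response length from two bounds: byte 4 of the response is the SCSI ADDITIONAL LENGTH field, so the device claims resp_buf[4] + 5 bytes in total, and that claim is capped by what the host actually transferred. A worked example with illustrative values:

/*
 * Illustrative values only.
 */
uint8_t resp_buf[96];
int resp_xfer_len = 64;		/* bytes the host transferred */
int resp_buf_len, data_len;

resp_buf[4] = 31;		/* ADDITIONAL LENGTH field */
resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;	/* 36 */
data_len = (resp_buf_len < resp_xfer_len) ?
    resp_buf_len : resp_xfer_len;	/* 36 == SHORT_INQUIRY_LENGTH */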
diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h
index f2b9480..9205e35 100644
--- a/sys/dev/hyperv/storvsc/hv_vstorage.h
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@@ -249,9 +249,9 @@ struct vstor_packet {
/**
* SRB Status Masks (can be combined with above status codes)
*/
-#define SRB_STATUS_QUEUE_FROZEN 0x40
-#define SRB_STATUS_AUTOSENSE_VALID 0x80
-
+#define SRB_STATUS_QUEUE_FROZEN 0x40
+#define SRB_STATUS_AUTOSENSE_VALID 0x80
+#define SRB_STATUS_INVALID_LUN		0x20
/**
* Packet flags
diff --git a/sys/dev/isp/isp.c b/sys/dev/isp/isp.c
index aa36453..9d38f60 100644
--- a/sys/dev/isp/isp.c
+++ b/sys/dev/isp/isp.c
@@ -2431,6 +2431,7 @@ isp_fc_enable_vp(ispsoftc_t *isp, int chan)
__func__, chan, vp.vp_mod_hdr.rqs_flags, vp.vp_mod_status);
return (EIO);
}
+ GET_NANOTIME(&isp->isp_init_time);
return (0);
}
@@ -5865,6 +5866,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* These are broadcast events that have to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
int topo = fcp->isp_topo;
@@ -5921,6 +5923,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* This is a broadcast event that has to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
@@ -5964,6 +5967,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* This is a broadcast event that has to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
@@ -6162,6 +6166,7 @@ isp_handle_other_response(ispsoftc_t *isp, int type, isphdr_t *hp, uint32_t *opt
portid = (uint32_t)rid.ridacq_vp_port_hi << 16 |
rid.ridacq_vp_port_lo;
if (rid.ridacq_format == 0) {
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcparam *fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c
index c6b8dc4..cfaccea 100644
--- a/sys/dev/isp/isp_freebsd.c
+++ b/sys/dev/isp/isp_freebsd.c
@@ -856,7 +856,7 @@ static void isp_handle_platform_atio7(ispsoftc_t *, at7_entry_t *);
static void isp_handle_platform_ctio(ispsoftc_t *, void *);
static void isp_handle_platform_notify_fc(ispsoftc_t *, in_fcentry_t *);
static void isp_handle_platform_notify_24xx(ispsoftc_t *, in_fcentry_24xx_t *);
-static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *);
+static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *, uint32_t rsp);
static void isp_handle_platform_target_tmf(ispsoftc_t *, isp_notify_t *);
static void isp_target_mark_aborted(ispsoftc_t *, union ccb *);
static void isp_target_mark_aborted_early(ispsoftc_t *, tstate_t *, uint32_t);
@@ -2003,7 +2003,7 @@ noresrc:
ntp = isp_get_ntpd(isp, tptr);
if (ntp == NULL) {
rls_lun_statep(isp, tptr);
- isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0);
+ isp_endcmd(isp, aep, SCSI_STATUS_BUSY, 0);
return;
}
memcpy(ntp->rd.data, aep, QENTRY_LEN);
@@ -2055,7 +2055,7 @@ isp_handle_platform_atio7(ispsoftc_t *isp, at7_entry_t *aep)
* It's a bit tricky here as we need to stash this command *somewhere*.
*/
GET_NANOTIME(&now);
- if (NANOTIME_SUB(&isp->isp_init_time, &now) > 2000000000ULL) {
+ if (NANOTIME_SUB(&now, &isp->isp_init_time) > 2000000000ULL) {
isp_prt(isp, ISP_LOGWARN, "%s: [RX_ID 0x%x] D_ID %x not found on any channel- dropping", __func__, aep->at_rxid, did);
isp_endcmd(isp, aep, NIL_HANDLE, ISP_NOCHAN, ECMD_TERMINATE, 0);
return;
@@ -2761,7 +2761,7 @@ isp_handle_platform_notify_24xx(ispsoftc_t *isp, in_fcentry_24xx_t *inot)
}
static int
-isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp)
+isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp, uint32_t rsp)
{
if (isp->isp_state != ISP_RUNSTATE) {
@@ -2796,6 +2796,15 @@ isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp)
cto->ct_oxid = aep->at_hdr.ox_id;
cto->ct_flags = CT7_SENDSTATUS|CT7_NOACK|CT7_NO_DATA|CT7_FLAG_MODE1;
cto->ct_flags |= (aep->at_ta_len >> 12) << CT7_TASK_ATTR_SHIFT;
+ if (rsp != 0) {
+ cto->ct_scsi_status |= (FCP_RSPLEN_VALID << 8);
+ cto->rsp.m1.ct_resplen = 4;
+ ISP_MEMZERO(cto->rsp.m1.ct_resp, sizeof (cto->rsp.m1.ct_resp));
+ cto->rsp.m1.ct_resp[0] = rsp & 0xff;
+ cto->rsp.m1.ct_resp[1] = (rsp >> 8) & 0xff;
+ cto->rsp.m1.ct_resp[2] = (rsp >> 16) & 0xff;
+ cto->rsp.m1.ct_resp[3] = (rsp >> 24) & 0xff;
+ }
return (isp_target_put_entry(isp, &local));
}
@@ -3642,7 +3651,8 @@ isp_action(struct cam_sim *sim, union ccb *ccb)
xpt_done(ccb);
break;
}
- if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt)) {
+ if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt,
+ (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0)) {
rls_lun_statep(isp, tptr);
cam_freeze_devq(ccb->ccb_h.path);
cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0);
@@ -4407,11 +4417,11 @@ changed:
/*
* This is device arrival/departure notification
*/
- isp_handle_platform_target_notify_ack(isp, notify);
+ isp_handle_platform_target_notify_ack(isp, notify, 0);
break;
default:
isp_prt(isp, ISP_LOGALL, "target notify code 0x%x", notify->nt_ncode);
- isp_handle_platform_target_notify_ack(isp, notify);
+ isp_handle_platform_target_notify_ack(isp, notify, 0);
break;
}
break;
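Two related changes run through the isp hunks: isp_init_time is now refreshed on every link-affecting async event, and the timeout test in isp_handle_platform_atio7() has its subtraction operands put in the right order. NANOTIME_SUB() evaluates (first - second) in nanoseconds, so "time since initialization" must be now minus init_time; with the operands reversed, the unsigned result underflows. A worked illustration:

/*
 * Illustrative numbers: init_time = 100.0 s, now = 103.0 s.
 *
 *   NANOTIME_SUB(&now, &init_time) -> 3,000,000,000 ns (> 2 s, correct)
 *   NANOTIME_SUB(&init_time, &now) -> unsigned underflow, huge value,
 *                                     so the old test fired spuriously
 */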
diff --git a/sys/dev/ntb/if_ntb/if_ntb.c b/sys/dev/ntb/if_ntb/if_ntb.c
index d107d06..33645c4 100644
--- a/sys/dev/ntb/if_ntb/if_ntb.c
+++ b/sys/dev/ntb/if_ntb/if_ntb.c
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -25,21 +26,27 @@
* SUCH DAMAGE.
*/
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a driver for a simulated Ethernet device, using
+ * an underlying NTB Transport device.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
-#include <sys/bitset.h>
+#include <sys/buf_ring.h>
#include <sys/bus.h>
-#include <sys/ktr.h>
#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
#include <sys/module.h>
-#include <sys/mutex.h>
-#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
@@ -48,426 +55,163 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>
#include <net/ethernet.h>
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
#include <machine/bus.h>
-#include <machine/cpufunc.h>
-#include <machine/pmap.h>
-
-#include <netinet/in.h>
-#include <netinet/ip.h>
-
-#include "../ntb_hw/ntb_hw.h"
-
-/*
- * The Non-Transparent Bridge (NTB) is a device on some Intel processors that
- * allows you to connect two systems using a PCI-e link.
- *
- * This module contains a protocol for sending and receiving messages, and
- * exposes that protocol through a simulated ethernet device called ntb.
- *
- * NOTE: Much of the code in this module is shared with Linux. Any patches may
- * be picked up and redistributed in Linux with a dual GPL/BSD license.
- */
-#define QP_SETSIZE 64
-BITSET_DEFINE(_qpset, QP_SETSIZE);
-#define test_bit(pos, addr) BIT_ISSET(QP_SETSIZE, (pos), (addr))
-#define set_bit(pos, addr) BIT_SET(QP_SETSIZE, (pos), (addr))
-#define clear_bit(pos, addr) BIT_CLR(QP_SETSIZE, (pos), (addr))
-#define ffs_bit(addr) BIT_FFS(QP_SETSIZE, (addr))
+#include "../ntb_transport.h"
#define KTR_NTB KTR_SPARE3
+#define NTB_MEDIATYPE (IFM_ETHER | IFM_AUTO | IFM_FDX)
-#define NTB_TRANSPORT_VERSION 4
-#define NTB_RX_MAX_PKTS 64
-#define NTB_RXQ_SIZE 300
-
-enum ntb_link_event {
- NTB_LINK_DOWN = 0,
- NTB_LINK_UP,
-};
+#define NTB_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)
+#define NTB_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6)
+#define NTB_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \
+ CSUM_PSEUDO_HDR | \
+ CSUM_IP_CHECKED | CSUM_IP_VALID | \
+ CSUM_SCTP_VALID)
static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb");
-static unsigned g_if_ntb_debug_level;
-TUNABLE_INT("hw.if_ntb.debug_level", &g_if_ntb_debug_level);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN,
- &g_if_ntb_debug_level, 0, "if_ntb log level -- higher is more verbose");
-#define ntb_printf(lvl, ...) do { \
- if ((lvl) <= g_if_ntb_debug_level) { \
- if_printf(nt->ifp, __VA_ARGS__); \
- } \
-} while (0)
-
-static unsigned transport_mtu = IP_MAXPACKET + ETHER_HDR_LEN + ETHER_CRC_LEN;
-
-static uint64_t max_mw_size;
-TUNABLE_QUAD("hw.if_ntb.max_mw_size", &max_mw_size);
-SYSCTL_UQUAD(_hw_if_ntb, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0,
- "If enabled (non-zero), limit the size of large memory windows. "
- "Both sides of the NTB MUST set the same value here.");
-
-static unsigned max_num_clients;
-TUNABLE_INT("hw.if_ntb.max_num_clients", &max_num_clients);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, max_num_clients, CTLFLAG_RDTUN,
- &max_num_clients, 0, "Maximum number of NTB transport clients. "
- "0 (default) - use all available NTB memory windows; "
- "positive integer N - Limit to N memory windows.");
-
-static unsigned enable_xeon_watchdog;
-TUNABLE_INT("hw.if_ntb.enable_xeon_watchdog", &enable_xeon_watchdog);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN,
- &enable_xeon_watchdog, 0, "If non-zero, write a register every second to "
- "keep a watchdog from tearing down the NTB link");
-
-STAILQ_HEAD(ntb_queue_list, ntb_queue_entry);
-
-typedef uint32_t ntb_q_idx_t;
-
-struct ntb_queue_entry {
- /* ntb_queue list reference */
- STAILQ_ENTRY(ntb_queue_entry) entry;
-
- /* info on data to be transferred */
- void *cb_data;
- void *buf;
- uint32_t len;
- uint32_t flags;
-
- struct ntb_transport_qp *qp;
- struct ntb_payload_header *x_hdr;
- ntb_q_idx_t index;
-};
-
-struct ntb_rx_info {
- ntb_q_idx_t entry;
-};
-
-struct ntb_transport_qp {
- struct ntb_transport_ctx *transport;
- struct ntb_softc *ntb;
-
- void *cb_data;
-
- bool client_ready;
- volatile bool link_is_up;
- uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */
-
- struct ntb_rx_info *rx_info;
- struct ntb_rx_info *remote_rx_info;
-
- void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- struct ntb_queue_list tx_free_q;
- struct mtx ntb_tx_free_q_lock;
- caddr_t tx_mw;
- bus_addr_t tx_mw_phys;
- ntb_q_idx_t tx_index;
- ntb_q_idx_t tx_max_entry;
- uint64_t tx_max_frame;
-
- void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- struct ntb_queue_list rx_post_q;
- struct ntb_queue_list rx_pend_q;
- /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */
- struct mtx ntb_rx_q_lock;
- struct task rx_completion_task;
- struct task rxc_db_work;
- caddr_t rx_buff;
- ntb_q_idx_t rx_index;
- ntb_q_idx_t rx_max_entry;
- uint64_t rx_max_frame;
-
- void (*event_handler)(void *data, enum ntb_link_event status);
- struct callout link_work;
- struct callout queue_full;
- struct callout rx_full;
-
- uint64_t last_rx_no_buf;
+static unsigned g_if_ntb_num_queues = UINT_MAX;
+SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN,
+ &g_if_ntb_num_queues, 0, "Number of queues per interface");
- /* Stats */
- uint64_t rx_bytes;
- uint64_t rx_pkts;
- uint64_t rx_ring_empty;
- uint64_t rx_err_no_buf;
- uint64_t rx_err_oflow;
- uint64_t rx_err_ver;
- uint64_t tx_bytes;
- uint64_t tx_pkts;
- uint64_t tx_ring_full;
- uint64_t tx_err_no_buf;
-};
-
-struct ntb_queue_handlers {
- void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- void (*event_handler)(void *data, enum ntb_link_event status);
-};
-
-struct ntb_transport_mw {
- vm_paddr_t phys_addr;
- size_t phys_size;
- size_t xlat_align;
- size_t xlat_align_size;
- bus_addr_t addr_limit;
- /* Tx buff is off vbase / phys_addr */
- caddr_t vbase;
- size_t xlat_size;
- size_t buff_size;
- /* Rx buff is off virt_addr / dma_addr */
- caddr_t virt_addr;
- bus_addr_t dma_addr;
-};
-
-struct ntb_transport_ctx {
- struct ntb_softc *ntb;
+struct ntb_net_queue {
+ struct ntb_net_ctx *sc;
struct ifnet *ifp;
- struct ntb_transport_mw mw_vec[NTB_MAX_NUM_MW];
- struct ntb_transport_qp *qp_vec;
- struct _qpset qp_bitmap;
- struct _qpset qp_bitmap_free;
- unsigned mw_count;
- unsigned qp_count;
- volatile bool link_is_up;
- struct callout link_work;
- struct callout link_watchdog;
- struct task link_cleanup;
- uint64_t bufsize;
- u_char eaddr[ETHER_ADDR_LEN];
- struct mtx tx_lock;
- struct mtx rx_lock;
-
- /* The hardcoded single queuepair in ntb_setup_interface() */
struct ntb_transport_qp *qp;
+ struct buf_ring *br;
+ struct task tx_task;
+ struct taskqueue *tx_tq;
+ struct mtx tx_lock;
+ struct callout queue_full;
};
-static struct ntb_transport_ctx net_softc;
-
-enum {
- IF_NTB_DESC_DONE_FLAG = 1 << 0,
- IF_NTB_LINK_DOWN_FLAG = 1 << 1,
-};
-
-struct ntb_payload_header {
- ntb_q_idx_t ver;
- uint32_t len;
- uint32_t flags;
-};
-
-enum {
- /*
- * The order of this enum is part of the if_ntb remote protocol. Do
- * not reorder without bumping protocol version (and it's probably best
- * to keep the protocol in lock-step with the Linux NTB driver.
- */
- IF_NTB_VERSION = 0,
- IF_NTB_QP_LINKS,
- IF_NTB_NUM_QPS,
- IF_NTB_NUM_MWS,
- /*
- * N.B.: transport_link_work assumes MW1 enums = MW0 + 2.
- */
- IF_NTB_MW0_SZ_HIGH,
- IF_NTB_MW0_SZ_LOW,
- IF_NTB_MW1_SZ_HIGH,
- IF_NTB_MW1_SZ_LOW,
- IF_NTB_MAX_SPAD,
-
- /*
- * Some NTB-using hardware have a watchdog to work around NTB hangs; if
- * a register or doorbell isn't written every few seconds, the link is
- * torn down. Write an otherwise unused register every few seconds to
- * work around this watchdog.
- */
- IF_NTB_WATCHDOG_SPAD = 15
+struct ntb_net_ctx {
+ struct ifnet *ifp;
+ struct ifmedia media;
+ u_char eaddr[ETHER_ADDR_LEN];
+ int num_queues;
+ struct ntb_net_queue *queues;
+ int mtu;
};
-CTASSERT(IF_NTB_WATCHDOG_SPAD < XEON_SPAD_COUNT &&
- IF_NTB_WATCHDOG_SPAD < ATOM_SPAD_COUNT);
-
-#define QP_TO_MW(nt, qp) ((qp) % nt->mw_count)
-#define NTB_QP_DEF_NUM_ENTRIES 100
-#define NTB_LINK_DOWN_TIMEOUT 10
-static int ntb_handle_module_events(struct module *m, int what, void *arg);
-static int ntb_setup_interface(void);
-static int ntb_teardown_interface(void);
+static int ntb_net_probe(device_t dev);
+static int ntb_net_attach(device_t dev);
+static int ntb_net_detach(device_t dev);
static void ntb_net_init(void *arg);
+static int ntb_ifmedia_upd(struct ifnet *);
+static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *);
static int ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
-static void ntb_start(struct ifnet *ifp);
+static int ntb_transmit(struct ifnet *ifp, struct mbuf *m);
static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
void *data, int len);
static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
void *data, int len);
static void ntb_net_event_handler(void *data, enum ntb_link_event status);
-static int ntb_transport_probe(struct ntb_softc *ntb);
-static void ntb_transport_free(struct ntb_transport_ctx *);
-static void ntb_transport_init_queue(struct ntb_transport_ctx *nt,
- unsigned int qp_num);
-static void ntb_transport_free_queue(struct ntb_transport_qp *qp);
-static struct ntb_transport_qp *ntb_transport_create_queue(void *data,
- struct ntb_softc *pdev, const struct ntb_queue_handlers *handlers);
-static void ntb_transport_link_up(struct ntb_transport_qp *qp);
-static int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb,
- void *data, unsigned int len);
-static int ntb_process_tx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry);
-static void ntb_memcpy_tx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry, void *offset);
+static void ntb_handle_tx(void *arg, int pending);
static void ntb_qp_full(void *arg);
-static void ntb_transport_rxc_db(void *arg, int pending);
-static int ntb_process_rxc(struct ntb_transport_qp *qp);
-static void ntb_memcpy_rx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry, void *offset);
-static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp,
- void *data);
-static void ntb_complete_rxc(void *arg, int pending);
-static void ntb_transport_doorbell_callback(void *data, uint32_t vector);
-static void ntb_transport_event_callback(void *data);
-static void ntb_transport_link_work(void *arg);
-static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size);
-static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw);
-static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
- unsigned int qp_num);
-static void ntb_qp_link_work(void *arg);
-static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt);
-static void ntb_transport_link_cleanup_work(void *, int);
-static void ntb_qp_link_down(struct ntb_transport_qp *qp);
-static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp);
-static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp);
-static void ntb_transport_link_down(struct ntb_transport_qp *qp);
-static void ntb_send_link_down(struct ntb_transport_qp *qp);
-static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
- struct ntb_queue_list *list);
-static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock,
- struct ntb_queue_list *list);
-static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock,
- struct ntb_queue_list *from, struct ntb_queue_list *to);
+static void ntb_qflush(struct ifnet *ifp);
static void create_random_local_eui48(u_char *eaddr);
-static unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
-static void xeon_link_watchdog_hb(void *);
-
-static const struct ntb_ctx_ops ntb_transport_ops = {
- .link_event = ntb_transport_event_callback,
- .db_event = ntb_transport_doorbell_callback,
-};
-MALLOC_DEFINE(M_NTB_IF, "if_ntb", "ntb network driver");
-
-static inline void
-iowrite32(uint32_t val, void *addr)
-{
-
- bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr,
- val);
-}
-
-/* Module setup and teardown */
static int
-ntb_handle_module_events(struct module *m, int what, void *arg)
+ntb_net_probe(device_t dev)
{
- int err = 0;
- switch (what) {
- case MOD_LOAD:
- err = ntb_setup_interface();
- break;
- case MOD_UNLOAD:
- err = ntb_teardown_interface();
- break;
- default:
- err = EOPNOTSUPP;
- break;
- }
- return (err);
+ device_set_desc(dev, "NTB Network Interface");
+ return (0);
}
-static moduledata_t if_ntb_mod = {
- "if_ntb",
- ntb_handle_module_events,
- NULL
-};
-
-DECLARE_MODULE(if_ntb, if_ntb_mod, SI_SUB_KLD, SI_ORDER_ANY);
-MODULE_DEPEND(if_ntb, ntb_hw, 1, 1, 1);
-
static int
-ntb_setup_interface(void)
+ntb_net_attach(device_t dev)
{
+ struct ntb_net_ctx *sc = device_get_softc(dev);
+ struct ntb_net_queue *q;
struct ifnet *ifp;
struct ntb_queue_handlers handlers = { ntb_net_rx_handler,
ntb_net_tx_handler, ntb_net_event_handler };
- int rc;
-
- net_softc.ntb = devclass_get_softc(devclass_find("ntb_hw"), 0);
- if (net_softc.ntb == NULL) {
- printf("ntb: Cannot find devclass\n");
- return (ENXIO);
- }
+ int i;
- ifp = net_softc.ifp = if_alloc(IFT_ETHER);
+ ifp = sc->ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
- ntb_transport_free(&net_softc);
printf("ntb: Cannot allocate ifnet structure\n");
return (ENOMEM);
}
- if_initname(ifp, "ntb", 0);
-
- rc = ntb_transport_probe(net_softc.ntb);
- if (rc != 0) {
- printf("ntb: Cannot init transport: %d\n", rc);
- if_free(net_softc.ifp);
- return (rc);
- }
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+ sc->num_queues = min(g_if_ntb_num_queues,
+ ntb_transport_queue_count(dev));
+ sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ sc->mtu = INT_MAX;
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ q->sc = sc;
+ q->ifp = ifp;
+ q->qp = ntb_transport_create_queue(dev, i, &handlers, q);
+ if (q->qp == NULL)
+ break;
+ sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp));
+ mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF);
+ q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock);
+ TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q);
+ q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT,
+ taskqueue_thread_enqueue, &q->tx_tq);
+ taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d",
+ device_get_nameunit(dev), i);
+ callout_init(&q->queue_full, 1);
+ }
+ sc->num_queues = i;
+ device_printf(dev, "%d queue(s)\n", sc->num_queues);
- net_softc.qp = ntb_transport_create_queue(ifp, net_softc.ntb,
- &handlers);
ifp->if_init = ntb_net_init;
- ifp->if_softc = &net_softc;
- ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX;
+ ifp->if_softc = sc;
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_ioctl = ntb_ioctl;
- ifp->if_start = ntb_start;
- IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN);
- ifp->if_snd.ifq_drv_maxlen = IFQ_MAXLEN;
- IFQ_SET_READY(&ifp->if_snd);
- create_random_local_eui48(net_softc.eaddr);
- ether_ifattach(ifp, net_softc.eaddr);
- ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_JUMBO_MTU;
- ifp->if_capenable = ifp->if_capabilities;
- ifp->if_mtu = ntb_transport_max_size(net_softc.qp) - ETHER_HDR_LEN -
- ETHER_CRC_LEN;
-
- ntb_transport_link_up(net_softc.qp);
- net_softc.bufsize = ntb_transport_max_size(net_softc.qp) +
- sizeof(struct ether_header);
+ ifp->if_transmit = ntb_transmit;
+ ifp->if_qflush = ntb_qflush;
+ create_random_local_eui48(sc->eaddr);
+ ether_ifattach(ifp, sc->eaddr);
+ ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 |
+ IFCAP_JUMBO_MTU | IFCAP_LINKSTATE;
+ ifp->if_capenable = IFCAP_JUMBO_MTU | IFCAP_LINKSTATE;
+ ifp->if_mtu = sc->mtu - ETHER_HDR_LEN;
+
+ ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd,
+ ntb_ifmedia_sts);
+ ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL);
+ ifmedia_set(&sc->media, NTB_MEDIATYPE);
+
+ for (i = 0; i < sc->num_queues; i++)
+ ntb_transport_link_up(sc->queues[i].qp);
return (0);
}
static int
-ntb_teardown_interface(void)
+ntb_net_detach(device_t dev)
{
+ struct ntb_net_ctx *sc = device_get_softc(dev);
+ struct ntb_net_queue *q;
+ int i;
- if (net_softc.qp != NULL) {
- ntb_transport_link_down(net_softc.qp);
-
- ntb_transport_free_queue(net_softc.qp);
- ntb_transport_free(&net_softc);
- }
-
- if (net_softc.ifp != NULL) {
- ether_ifdetach(net_softc.ifp);
- if_free(net_softc.ifp);
- net_softc.ifp = NULL;
- }
-
+ for (i = 0; i < sc->num_queues; i++)
+ ntb_transport_link_down(sc->queues[i].qp);
+ ether_ifdetach(sc->ifp);
+ if_free(sc->ifp);
+ ifmedia_removeall(&sc->media);
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ ntb_transport_free_queue(q->qp);
+ buf_ring_free(q->br, M_DEVBUF);
+ callout_drain(&q->queue_full);
+ taskqueue_drain_all(q->tx_tq);
+ mtx_destroy(&q->tx_lock);
+ }
+ free(sc->queues, M_DEVBUF);
return (0);
}
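The attach path above gives each queue its own buf_ring, mutex, and fast taskqueue thread. A hedged sketch of how an if_transmit handler typically feeds a queue set up this way (the drbr_* helpers are the stock buf_ring wrappers; queue selection by flowid is an assumption here, not a quote of ntb_transmit()):

/*
 * Sketch of a multi-queue if_transmit for the setup above.
 */
static int
sketch_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct ntb_net_ctx *sc = ifp->if_softc;
	struct ntb_net_queue *q;
	int error, i = 0;

	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
		i = m->m_pkthdr.flowid % sc->num_queues;
	q = &sc->queues[i];
	error = drbr_enqueue(ifp, q->br, m);	  /* stage on the ring */
	taskqueue_enqueue(q->tx_tq, &q->tx_task); /* kick the tx task */
	return (error);
}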
@@ -476,27 +220,26 @@ ntb_teardown_interface(void)
static void
ntb_net_init(void *arg)
{
- struct ntb_transport_ctx *ntb_softc = arg;
- struct ifnet *ifp = ntb_softc->ifp;
+ struct ntb_net_ctx *sc = arg;
+ struct ifnet *ifp = sc->ifp;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- ifp->if_flags |= IFF_UP;
- if_link_state_change(ifp, LINK_STATE_UP);
+ if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ?
+ LINK_STATE_UP : LINK_STATE_DOWN);
}
static int
ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
- struct ntb_transport_ctx *nt = ifp->if_softc;
+ struct ntb_net_ctx *sc = ifp->if_softc;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (command) {
case SIOCSIFMTU:
{
- if (ifr->ifr_mtu > ntb_transport_max_size(nt->qp) -
- ETHER_HDR_LEN - ETHER_CRC_LEN) {
+ if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) {
error = EINVAL;
break;
}
@@ -504,1185 +247,242 @@ ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
break;
}
- default:
- error = ether_ioctl(ifp, command, data);
- break;
- }
-
- return (error);
-}
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
+ break;
-static void
-ntb_start(struct ifnet *ifp)
-{
- struct mbuf *m_head;
- struct ntb_transport_ctx *nt = ifp->if_softc;
- int rc;
-
- mtx_lock(&nt->tx_lock);
- ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- CTR0(KTR_NTB, "TX: ntb_start");
- while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
- CTR1(KTR_NTB, "TX: start mbuf %p", m_head);
- rc = ntb_transport_tx_enqueue(nt->qp, m_head, m_head,
- m_length(m_head, NULL));
- if (rc != 0) {
- CTR1(KTR_NTB,
- "TX: could not tx mbuf %p. Returning to snd q",
- m_head);
- if (rc == EAGAIN) {
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
- IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
- callout_reset(&nt->qp->queue_full, hz / 1000,
- ntb_qp_full, ifp);
- }
- break;
+ case SIOCSIFCAP:
+ if (ifr->ifr_reqcap & IFCAP_RXCSUM)
+ ifp->if_capenable |= IFCAP_RXCSUM;
+ else
+ ifp->if_capenable &= ~IFCAP_RXCSUM;
+ if (ifr->ifr_reqcap & IFCAP_TXCSUM) {
+ ifp->if_capenable |= IFCAP_TXCSUM;
+ ifp->if_hwassist |= NTB_CSUM_FEATURES;
+ } else {
+ ifp->if_capenable &= ~IFCAP_TXCSUM;
+ ifp->if_hwassist &= ~NTB_CSUM_FEATURES;
+ }
+ if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6)
+ ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
+ else
+ ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
+ if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) {
+ ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
+ ifp->if_hwassist |= NTB_CSUM_FEATURES6;
+ } else {
+ ifp->if_capenable &= ~IFCAP_TXCSUM_IPV6;
+ ifp->if_hwassist &= ~NTB_CSUM_FEATURES6;
}
-
- }
- mtx_unlock(&nt->tx_lock);
-}
-
-/* Network Device Callbacks */
-static void
-ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
- int len)
-{
-
- m_freem(data);
- CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data);
-}
-
-static void
-ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
- int len)
-{
- struct mbuf *m = data;
- struct ifnet *ifp = qp_data;
-
- CTR0(KTR_NTB, "RX: rx handler");
- (*ifp->if_input)(ifp, m);
-}
-
-static void
-ntb_net_event_handler(void *data, enum ntb_link_event status)
-{
- struct ifnet *ifp;
-
- ifp = data;
- (void)ifp;
-
- /* XXX The Linux driver munges with the carrier status here. */
-
- switch (status) {
- case NTB_LINK_DOWN:
- break;
- case NTB_LINK_UP:
break;
- default:
- panic("Bogus ntb_link_event %u\n", status);
- }
-}
-
-/* Transport Init and teardown */
-
-static void
-xeon_link_watchdog_hb(void *arg)
-{
- struct ntb_transport_ctx *nt;
-
- nt = arg;
- ntb_spad_write(nt->ntb, IF_NTB_WATCHDOG_SPAD, 0);
- callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt);
-}
-
-static int
-ntb_transport_probe(struct ntb_softc *ntb)
-{
- struct ntb_transport_ctx *nt = &net_softc;
- struct ntb_transport_mw *mw;
- uint64_t qp_bitmap;
- int rc;
- unsigned i;
-
- nt->mw_count = ntb_mw_count(ntb);
- for (i = 0; i < nt->mw_count; i++) {
- mw = &nt->mw_vec[i];
-
- rc = ntb_mw_get_range(ntb, i, &mw->phys_addr, &mw->vbase,
- &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size,
- &mw->addr_limit);
- if (rc != 0)
- goto err;
-
- mw->buff_size = 0;
- mw->xlat_size = 0;
- mw->virt_addr = NULL;
- mw->dma_addr = 0;
-
- rc = ntb_mw_set_wc(nt->ntb, i, VM_MEMATTR_WRITE_COMBINING);
- if (rc)
- ntb_printf(0, "Unable to set mw%d caching\n", i);
- }
-
- qp_bitmap = ntb_db_valid_mask(ntb);
- nt->qp_count = flsll(qp_bitmap);
- KASSERT(nt->qp_count != 0, ("bogus db bitmap"));
- nt->qp_count -= 1;
-
- if (max_num_clients != 0 && max_num_clients < nt->qp_count)
- nt->qp_count = max_num_clients;
- else if (nt->mw_count < nt->qp_count)
- nt->qp_count = nt->mw_count;
- KASSERT(nt->qp_count <= QP_SETSIZE, ("invalid qp_count"));
-
- mtx_init(&nt->tx_lock, "ntb transport tx", NULL, MTX_DEF);
- mtx_init(&nt->rx_lock, "ntb transport rx", NULL, MTX_DEF);
-
- nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_IF,
- M_WAITOK | M_ZERO);
-
- for (i = 0; i < nt->qp_count; i++) {
- set_bit(i, &nt->qp_bitmap);
- set_bit(i, &nt->qp_bitmap_free);
- ntb_transport_init_queue(nt, i);
- }
-
- callout_init(&nt->link_work, 0);
- callout_init(&nt->link_watchdog, 0);
- TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt);
-
- rc = ntb_set_ctx(ntb, nt, &ntb_transport_ops);
- if (rc != 0)
- goto err;
-
- nt->link_is_up = false;
- ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
- ntb_link_event(ntb);
-
- callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
- if (enable_xeon_watchdog != 0)
- callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt);
- return (0);
-
-err:
- free(nt->qp_vec, M_NTB_IF);
- nt->qp_vec = NULL;
- return (rc);
-}
-
-static void
-ntb_transport_free(struct ntb_transport_ctx *nt)
-{
- struct ntb_softc *ntb = nt->ntb;
- struct _qpset qp_bitmap_alloc;
- uint8_t i;
-
- ntb_transport_link_cleanup(nt);
- taskqueue_drain(taskqueue_swi, &nt->link_cleanup);
- callout_drain(&nt->link_work);
- callout_drain(&nt->link_watchdog);
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
- BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
- /* Verify that all the QPs are freed */
- for (i = 0; i < nt->qp_count; i++)
- if (test_bit(i, &qp_bitmap_alloc))
- ntb_transport_free_queue(&nt->qp_vec[i]);
-
- ntb_link_disable(ntb);
- ntb_clear_ctx(ntb);
-
- for (i = 0; i < nt->mw_count; i++)
- ntb_free_mw(nt, i);
-
- free(nt->qp_vec, M_NTB_IF);
-}
-
-static void
-ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num)
-{
- struct ntb_transport_mw *mw;
- struct ntb_transport_qp *qp;
- vm_paddr_t mw_base;
- uint64_t mw_size, qp_offset;
- size_t tx_size;
- unsigned num_qps_mw, mw_num, mw_count;
-
- mw_count = nt->mw_count;
- mw_num = QP_TO_MW(nt, qp_num);
- mw = &nt->mw_vec[mw_num];
-
- qp = &nt->qp_vec[qp_num];
- qp->qp_num = qp_num;
- qp->transport = nt;
- qp->ntb = nt->ntb;
- qp->client_ready = false;
- qp->event_handler = NULL;
- ntb_qp_link_down_reset(qp);
-
- if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count)
- num_qps_mw = nt->qp_count / mw_count + 1;
- else
- num_qps_mw = nt->qp_count / mw_count;
-
- mw_base = mw->phys_addr;
- mw_size = mw->phys_size;
-
- tx_size = mw_size / num_qps_mw;
- qp_offset = tx_size * (qp_num / mw_count);
-
- qp->tx_mw = mw->vbase + qp_offset;
- KASSERT(qp->tx_mw != NULL, ("uh oh?"));
-
- /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */
- qp->tx_mw_phys = mw_base + qp_offset;
- KASSERT(qp->tx_mw_phys != 0, ("uh oh?"));
-
- tx_size -= sizeof(struct ntb_rx_info);
- qp->rx_info = (void *)(qp->tx_mw + tx_size);
-
- /* Due to house-keeping, there must be at least 2 buffs */
- qp->tx_max_frame = qmin(tx_size / 2,
- transport_mtu + sizeof(struct ntb_payload_header));
- qp->tx_max_entry = tx_size / qp->tx_max_frame;
-
- callout_init(&qp->link_work, 0);
- callout_init(&qp->queue_full, CALLOUT_MPSAFE);
- callout_init(&qp->rx_full, CALLOUT_MPSAFE);
-
- mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN);
- mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN);
- TASK_INIT(&qp->rx_completion_task, 0, ntb_complete_rxc, qp);
- TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp);
-
- STAILQ_INIT(&qp->rx_post_q);
- STAILQ_INIT(&qp->rx_pend_q);
- STAILQ_INIT(&qp->tx_free_q);
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
-}
-
-static void
-ntb_transport_free_queue(struct ntb_transport_qp *qp)
-{
- struct ntb_queue_entry *entry;
-
- if (qp == NULL)
- return;
-
- callout_drain(&qp->link_work);
-
- ntb_db_set_mask(qp->ntb, 1ull << qp->qp_num);
- taskqueue_drain(taskqueue_swi, &qp->rxc_db_work);
- taskqueue_drain(taskqueue_swi, &qp->rx_completion_task);
-
- qp->cb_data = NULL;
- qp->rx_handler = NULL;
- qp->tx_handler = NULL;
- qp->event_handler = NULL;
-
- while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q)))
- free(entry, M_NTB_IF);
-
- while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q)))
- free(entry, M_NTB_IF);
-
- while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
- free(entry, M_NTB_IF);
-
- set_bit(qp->qp_num, &qp->transport->qp_bitmap_free);
-}
-
-/**
- * ntb_transport_create_queue - Create a new NTB transport layer queue
- * @rx_handler: receive callback function
- * @tx_handler: transmit callback function
- * @event_handler: event callback function
- *
- * Create a new NTB transport layer queue and provide the queue with a callback
- * routine for both transmit and receive. The receive callback routine will be
- * used to pass up data when the transport has received it on the queue. The
- * transmit callback routine will be called when the transport has completed the
- * transmission of the data on the queue and the data is ready to be freed.
- *
- * RETURNS: pointer to newly created ntb_queue, NULL on error.
- */
-static struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct ntb_softc *ntb,
- const struct ntb_queue_handlers *handlers)
-{
- struct ntb_queue_entry *entry;
- struct ntb_transport_qp *qp;
- struct ntb_transport_ctx *nt;
- unsigned int free_queue;
- int i;
-
- nt = ntb_get_ctx(ntb, NULL);
- KASSERT(nt != NULL, ("bogus"));
-
- free_queue = ffs_bit(&nt->qp_bitmap);
- if (free_queue == 0)
- return (NULL);
-
- /* decrement free_queue to make it zero based */
- free_queue--;
-
- qp = &nt->qp_vec[free_queue];
- clear_bit(qp->qp_num, &nt->qp_bitmap_free);
- qp->cb_data = data;
- qp->rx_handler = handlers->rx_handler;
- qp->tx_handler = handlers->tx_handler;
- qp->event_handler = handlers->event_handler;
-
- for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
- entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO);
- entry->cb_data = nt->ifp;
- entry->buf = NULL;
- entry->len = transport_mtu;
- ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q);
- }
-
- for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
- entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO);
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ default:
+ error = ether_ioctl(ifp, command, data);
+ break;
}
- ntb_db_clear(ntb, 1ull << qp->qp_num);
- ntb_db_clear_mask(ntb, 1ull << qp->qp_num);
- return (qp);
-}
-
-/**
- * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
- * @qp: NTB transport layer queue to be enabled
- *
- * Notify NTB transport layer of client readiness to use queue
- */
-static void
-ntb_transport_link_up(struct ntb_transport_qp *qp)
-{
- struct ntb_transport_ctx *nt;
-
- if (qp == NULL)
- return;
-
- qp->client_ready = true;
-
- nt = qp->transport;
- ntb_printf(2, "qp client ready\n");
-
- if (qp->transport->link_is_up)
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+ return (error);
}
-
-
-/* Transport Tx */
-
-/**
- * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
- * @qp: NTB transport layer queue the entry is to be enqueued on
- * @cb: per buffer pointer for callback function to use
- * @data: pointer to data buffer that will be sent
- * @len: length of the data buffer
- *
- * Enqueue a new transmit buffer onto the transport queue from which a NTB
- * payload will be transmitted. This assumes that a lock is being held to
- * serialize access to the qp.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
static int
-ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
- unsigned int len)
+ntb_ifmedia_upd(struct ifnet *ifp)
{
- struct ntb_queue_entry *entry;
- int rc;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ifmedia *ifm = &sc->media;
- if (qp == NULL || !qp->link_is_up || len == 0) {
- CTR0(KTR_NTB, "TX: link not up");
+ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
return (EINVAL);
- }
-
- entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
- if (entry == NULL) {
- CTR0(KTR_NTB, "TX: could not get entry from tx_free_q");
- qp->tx_err_no_buf++;
- return (EBUSY);
- }
- CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry);
-
- entry->cb_data = cb;
- entry->buf = data;
- entry->len = len;
- entry->flags = 0;
-
- rc = ntb_process_tx(qp, entry);
- if (rc != 0) {
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
- CTR1(KTR_NTB,
- "TX: process_tx failed. Returning entry %p to tx_free_q",
- entry);
- }
- return (rc);
-}
-
-static int
-ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
-{
- void *offset;
-
- offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
- CTR3(KTR_NTB,
- "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u",
- qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry);
- if (qp->tx_index == qp->remote_rx_info->entry) {
- CTR0(KTR_NTB, "TX: ring full");
- qp->tx_ring_full++;
- return (EAGAIN);
- }
-
- if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
- if (qp->tx_handler != NULL)
- qp->tx_handler(qp, qp->cb_data, entry->buf,
- EIO);
- else
- m_freem(entry->buf);
-
- entry->buf = NULL;
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
- CTR1(KTR_NTB,
- "TX: frame too big. returning entry %p to tx_free_q",
- entry);
- return (0);
- }
- CTR2(KTR_NTB, "TX: copying entry %p to offset %p", entry, offset);
- ntb_memcpy_tx(qp, entry, offset);
-
- qp->tx_index++;
- qp->tx_index %= qp->tx_max_entry;
-
- qp->tx_pkts++;
return (0);
}
static void
-ntb_memcpy_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
- void *offset)
-{
- struct ntb_payload_header *hdr;
-
- /* This piece is from Linux' ntb_async_tx() */
- hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame -
- sizeof(struct ntb_payload_header));
- entry->x_hdr = hdr;
- iowrite32(entry->len, &hdr->len);
- iowrite32(qp->tx_pkts, &hdr->ver);
-
- /* This piece is ntb_memcpy_tx() */
- CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset);
- if (entry->buf != NULL) {
- m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset);
-
- /*
- * Ensure that the data is fully copied before setting the
- * flags
- */
- wmb();
- }
-
- /* The rest is ntb_tx_copy_callback() */
- iowrite32(entry->flags | IF_NTB_DESC_DONE_FLAG, &hdr->flags);
- CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr);
-
- ntb_peer_db_set(qp->ntb, 1ull << qp->qp_num);
-
- /*
- * The entry length can only be zero if the packet is intended to be a
- * "link down" or similar. Since no payload is being sent in these
- * cases, there is nothing to add to the completion queue.
- */
- if (entry->len > 0) {
- qp->tx_bytes += entry->len;
-
- if (qp->tx_handler)
- qp->tx_handler(qp, qp->cb_data, entry->buf,
- entry->len);
- else
- m_freem(entry->buf);
- entry->buf = NULL;
- }
-
- CTR3(KTR_NTB,
- "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning "
- "to tx_free_q", entry, hdr->ver, hdr->flags);
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
-}
-
-static void
-ntb_qp_full(void *arg)
+ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
+ struct ntb_net_ctx *sc = ifp->if_softc;
- CTR0(KTR_NTB, "TX: qp_full callout");
- ntb_start(arg);
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = NTB_MEDIATYPE;
+ if (ntb_transport_link_query(sc->queues[0].qp))
+ ifmr->ifm_status |= IFM_ACTIVE;
}
-/* Transport Rx */
static void
-ntb_transport_rxc_db(void *arg, int pending __unused)
+ntb_transmit_locked(struct ntb_net_queue *q)
{
- struct ntb_transport_qp *qp = arg;
- ntb_q_idx_t i;
- int rc;
-
- /*
- * Limit the number of packets processed in a single interrupt to
- * provide fairness to others
- */
- CTR0(KTR_NTB, "RX: transport_rx");
- mtx_lock(&qp->transport->rx_lock);
- for (i = 0; i < qp->rx_max_entry; i++) {
- rc = ntb_process_rxc(qp);
+ struct ifnet *ifp = q->ifp;
+ struct mbuf *m;
+ int rc, len;
+ short mflags;
+
+ CTR0(KTR_NTB, "TX: ntb_transmit_locked");
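+	/*
+	 * drbr contract: peek leaves the mbuf on the ring; on success it is
+	 * consumed with advance, while EAGAIN (NTB TX ring full) puts it
+	 * back for a retry from the queue_full callout.
+	 */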
+ while ((m = drbr_peek(ifp, q->br)) != NULL) {
+ CTR1(KTR_NTB, "TX: start mbuf %p", m);
+ ETHER_BPF_MTAP(ifp, m);
+ len = m->m_pkthdr.len;
+ mflags = m->m_flags;
+ rc = ntb_transport_tx_enqueue(q->qp, m, m, len);
if (rc != 0) {
- CTR0(KTR_NTB, "RX: process_rxc failed");
+ CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc);
+ if (rc == EAGAIN) {
+ drbr_putback(ifp, q->br, m);
+ callout_reset_sbt(&q->queue_full,
+ SBT_1MS / 4, SBT_1MS / 4,
+ ntb_qp_full, q, 0);
+ } else {
+ m_freem(m);
+ drbr_advance(ifp, q->br);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ }
break;
}
- }
- mtx_unlock(&qp->transport->rx_lock);
-
- if (i == qp->rx_max_entry)
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- else if ((ntb_db_read(qp->ntb) & (1ull << qp->qp_num)) != 0) {
- /* If db is set, clear it and read it back to commit clear. */
- ntb_db_clear(qp->ntb, 1ull << qp->qp_num);
- (void)ntb_db_read(qp->ntb);
-
- /*
- * An interrupt may have arrived between finishing
- * ntb_process_rxc and clearing the doorbell bit: there might
- * be some more work to do.
- */
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
+ drbr_advance(ifp, q->br);
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
+ if (mflags & M_MCAST)
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
}
}
static int
-ntb_process_rxc(struct ntb_transport_qp *qp)
+ntb_transmit(struct ifnet *ifp, struct mbuf *m)
{
- struct ntb_payload_header *hdr;
- struct ntb_queue_entry *entry;
- caddr_t offset;
-
- offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
- hdr = (void *)(offset + qp->rx_max_frame -
- sizeof(struct ntb_payload_header));
-
- CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index);
- if ((hdr->flags & IF_NTB_DESC_DONE_FLAG) == 0) {
- CTR0(KTR_NTB, "RX: hdr not done");
- qp->rx_ring_empty++;
- return (EAGAIN);
- }
-
- if ((hdr->flags & IF_NTB_LINK_DOWN_FLAG) != 0) {
- CTR0(KTR_NTB, "RX: link down");
- ntb_qp_link_down(qp);
- hdr->flags = 0;
- return (EAGAIN);
- }
-
- if (hdr->ver != (uint32_t)qp->rx_pkts) {
- CTR2(KTR_NTB,"RX: ver != rx_pkts (%x != %lx). "
- "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts);
- qp->rx_err_ver++;
- return (EIO);
- }
-
- entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q);
- if (entry == NULL) {
- qp->rx_err_no_buf++;
- CTR0(KTR_NTB, "RX: No entries in rx_pend_q");
- return (EAGAIN);
- }
- callout_stop(&qp->rx_full);
- CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry);
-
- entry->x_hdr = hdr;
- entry->index = qp->rx_index;
-
- if (hdr->len > entry->len) {
- CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju",
- (uintmax_t)hdr->len, (uintmax_t)entry->len);
- qp->rx_err_oflow++;
-
- entry->len = -EIO;
- entry->flags |= IF_NTB_DESC_DONE_FLAG;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ntb_net_queue *q;
+ int error, i;
- taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task);
- } else {
- qp->rx_bytes += hdr->len;
- qp->rx_pkts++;
-
- CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts);
-
- entry->len = hdr->len;
-
- ntb_memcpy_rx(qp, entry, offset);
- }
-
- qp->rx_index++;
- qp->rx_index %= qp->rx_max_entry;
+ CTR0(KTR_NTB, "TX: ntb_transmit");
+ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+ i = m->m_pkthdr.flowid % sc->num_queues;
+ else
+ i = curcpu % sc->num_queues;
+ q = &sc->queues[i];
+
+ error = drbr_enqueue(ifp, q->br, m);
+ if (error)
+ return (error);
+
+ if (mtx_trylock(&q->tx_lock)) {
+ ntb_transmit_locked(q);
+ mtx_unlock(&q->tx_lock);
+ } else
+ taskqueue_enqueue(q->tx_tq, &q->tx_task);
return (0);
}
static void
-ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
- void *offset)
+ntb_handle_tx(void *arg, int pending)
{
- struct ifnet *ifp = entry->cb_data;
- unsigned int len = entry->len;
- struct mbuf *m;
-
- CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset);
- m = m_devget(offset, len, 0, ifp, NULL);
- m->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
+ struct ntb_net_queue *q = arg;
- entry->buf = (void *)m;
-
- /* Ensure that the data is globally visible before clearing the flag */
- wmb();
-
- CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, m);
- ntb_rx_copy_callback(qp, entry);
+ mtx_lock(&q->tx_lock);
+ ntb_transmit_locked(q);
+ mtx_unlock(&q->tx_lock);
}
-static inline void
-ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data)
+static void
+ntb_qp_full(void *arg)
{
- struct ntb_queue_entry *entry;
+ struct ntb_net_queue *q = arg;
- entry = data;
- entry->flags |= IF_NTB_DESC_DONE_FLAG;
- taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task);
+ CTR0(KTR_NTB, "TX: qp_full callout");
+ if (ntb_transport_tx_free_entry(q->qp) > 0)
+ taskqueue_enqueue(q->tx_tq, &q->tx_task);
+ else
+ callout_schedule_sbt(&q->queue_full,
+ SBT_1MS / 4, SBT_1MS / 4, 0);
}
static void
-ntb_complete_rxc(void *arg, int pending)
+ntb_qflush(struct ifnet *ifp)
{
- struct ntb_transport_qp *qp = arg;
- struct ntb_queue_entry *entry;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ntb_net_queue *q;
struct mbuf *m;
- unsigned len;
-
- CTR0(KTR_NTB, "RX: rx_completion_task");
-
- mtx_lock_spin(&qp->ntb_rx_q_lock);
-
- while (!STAILQ_EMPTY(&qp->rx_post_q)) {
- entry = STAILQ_FIRST(&qp->rx_post_q);
- if ((entry->flags & IF_NTB_DESC_DONE_FLAG) == 0)
- break;
-
- entry->x_hdr->flags = 0;
- iowrite32(entry->index, &qp->rx_info->entry);
-
- STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry);
-
- len = entry->len;
- m = entry->buf;
-
- /*
- * Re-initialize queue_entry for reuse; rx_handler takes
- * ownership of the mbuf.
- */
- entry->buf = NULL;
- entry->len = transport_mtu;
- entry->cb_data = qp->transport->ifp;
-
- STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry);
-
- mtx_unlock_spin(&qp->ntb_rx_q_lock);
+ int i;
- CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m);
- if (qp->rx_handler != NULL && qp->client_ready)
- qp->rx_handler(qp, qp->cb_data, m, len);
- else
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ mtx_lock(&q->tx_lock);
+ while ((m = buf_ring_dequeue_sc(q->br)) != NULL)
m_freem(m);
-
- mtx_lock_spin(&qp->ntb_rx_q_lock);
- }
-
- mtx_unlock_spin(&qp->ntb_rx_q_lock);
-}
-
-static void
-ntb_transport_doorbell_callback(void *data, uint32_t vector)
-{
- struct ntb_transport_ctx *nt = data;
- struct ntb_transport_qp *qp;
- struct _qpset db_bits;
- uint64_t vec_mask;
- unsigned qp_num;
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &db_bits);
- BIT_NAND(QP_SETSIZE, &db_bits, &nt->qp_bitmap_free);
-
- vec_mask = ntb_db_vector_mask(nt->ntb, vector);
- while (vec_mask != 0) {
- qp_num = ffsll(vec_mask) - 1;
-
- if (test_bit(qp_num, &db_bits)) {
- qp = &nt->qp_vec[qp_num];
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- }
-
- vec_mask &= ~(1ull << qp_num);
- }
-}
-
-/* Link Event handler */
-static void
-ntb_transport_event_callback(void *data)
-{
- struct ntb_transport_ctx *nt = data;
-
- if (ntb_link_is_up(nt->ntb, NULL, NULL)) {
- ntb_printf(1, "HW link up\n");
- callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
- } else {
- ntb_printf(1, "HW link down\n");
- taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup);
+ mtx_unlock(&q->tx_lock);
}
+ if_qflush(ifp);
}
-/* Link bring up */
+/* Network Device Callbacks */
static void
-ntb_transport_link_work(void *arg)
-{
- struct ntb_transport_ctx *nt = arg;
- struct ntb_softc *ntb = nt->ntb;
- struct ntb_transport_qp *qp;
- uint64_t val64, size;
- uint32_t val;
- unsigned i;
- int rc;
-
- /* send the local info, in the opposite order of the way we read it */
- for (i = 0; i < nt->mw_count; i++) {
- size = nt->mw_vec[i].phys_size;
-
- if (max_mw_size != 0 && size > max_mw_size)
- size = max_mw_size;
-
- ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2),
- size >> 32);
- ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), size);
- }
-
- ntb_peer_spad_write(ntb, IF_NTB_NUM_MWS, nt->mw_count);
-
- ntb_peer_spad_write(ntb, IF_NTB_NUM_QPS, nt->qp_count);
-
- ntb_peer_spad_write(ntb, IF_NTB_VERSION, NTB_TRANSPORT_VERSION);
-
- /* Query the remote side for its info */
- val = 0;
- ntb_spad_read(ntb, IF_NTB_VERSION, &val);
- if (val != NTB_TRANSPORT_VERSION)
- goto out;
-
- ntb_spad_read(ntb, IF_NTB_NUM_QPS, &val);
- if (val != nt->qp_count)
- goto out;
-
- ntb_spad_read(ntb, IF_NTB_NUM_MWS, &val);
- if (val != nt->mw_count)
- goto out;
-
- for (i = 0; i < nt->mw_count; i++) {
- ntb_spad_read(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2), &val);
- val64 = (uint64_t)val << 32;
-
- ntb_spad_read(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), &val);
- val64 |= val;
-
- rc = ntb_set_mw(nt, i, val64);
- if (rc != 0)
- goto free_mws;
- }
-
- nt->link_is_up = true;
- ntb_printf(1, "transport link up\n");
-
- for (i = 0; i < nt->qp_count; i++) {
- qp = &nt->qp_vec[i];
-
- ntb_transport_setup_qp_mw(nt, i);
-
- if (qp->client_ready)
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
- }
-
- return;
-
-free_mws:
- for (i = 0; i < nt->mw_count; i++)
- ntb_free_mw(nt, i);
-out:
- if (ntb_link_is_up(ntb, NULL, NULL))
- callout_reset(&nt->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt);
-}
-
-static int
-ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size)
+ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
+ int len)
{
- struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
- size_t xlat_size, buff_size;
- int rc;
-
- if (size == 0)
- return (EINVAL);
-
- xlat_size = roundup(size, mw->xlat_align_size);
- buff_size = xlat_size;
-
- /* No need to re-setup */
- if (mw->xlat_size == xlat_size)
- return (0);
-
- if (mw->buff_size != 0)
- ntb_free_mw(nt, num_mw);
-
- /* Alloc memory for receiving data. Must be aligned */
- mw->xlat_size = xlat_size;
- mw->buff_size = buff_size;
-
- mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_IF, M_ZERO, 0,
- mw->addr_limit, mw->xlat_align, 0);
- if (mw->virt_addr == NULL) {
- ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n",
- mw->buff_size, mw->xlat_size);
- mw->xlat_size = 0;
- mw->buff_size = 0;
- return (ENOMEM);
- }
- /* TODO: replace with bus_space_* functions */
- mw->dma_addr = vtophys(mw->virt_addr);
-
- /*
- * Ensure that the allocation from contigmalloc is aligned as
- * requested. XXX: This may not be needed -- brought in for parity
- * with the Linux driver.
- */
- if (mw->dma_addr % mw->xlat_align != 0) {
- ntb_printf(0,
- "DMA memory 0x%jx not aligned to BAR size 0x%zx\n",
- (uintmax_t)mw->dma_addr, size);
- ntb_free_mw(nt, num_mw);
- return (ENOMEM);
- }
-
- /* Notify HW the memory location of the receive buffer */
- rc = ntb_mw_set_trans(nt->ntb, num_mw, mw->dma_addr, mw->xlat_size);
- if (rc) {
- ntb_printf(0, "Unable to set mw%d translation\n", num_mw);
- ntb_free_mw(nt, num_mw);
- return (rc);
- }
- return (0);
+ m_freem(data);
+ CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data);
}
static void
-ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
+ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
+ int len)
{
- struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+ struct ntb_net_queue *q = qp_data;
+ struct ntb_net_ctx *sc = q->sc;
+ struct mbuf *m = data;
+ struct ifnet *ifp = q->ifp;
+ uint16_t proto;
- if (mw->virt_addr == NULL)
+ CTR1(KTR_NTB, "RX: rx handler (%d)", len);
+ if (len < 0) {
+ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
return;
-
- ntb_mw_clear_trans(nt->ntb, num_mw);
- contigfree(mw->virt_addr, mw->xlat_size, M_NTB_IF);
- mw->xlat_size = 0;
- mw->buff_size = 0;
- mw->virt_addr = NULL;
-}
-
-static int
-ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num)
-{
- struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
- struct ntb_transport_mw *mw;
- void *offset;
- ntb_q_idx_t i;
- size_t rx_size;
- unsigned num_qps_mw, mw_num, mw_count;
-
- mw_count = nt->mw_count;
- mw_num = QP_TO_MW(nt, qp_num);
- mw = &nt->mw_vec[mw_num];
-
- if (mw->virt_addr == NULL)
- return (ENOMEM);
-
- if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count)
- num_qps_mw = nt->qp_count / mw_count + 1;
- else
- num_qps_mw = nt->qp_count / mw_count;
-
- rx_size = mw->xlat_size / num_qps_mw;
- qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
- rx_size -= sizeof(struct ntb_rx_info);
-
- qp->remote_rx_info = (void*)(qp->rx_buff + rx_size);
-
- /* Due to house-keeping, there must be at least 2 buffs */
- qp->rx_max_frame = qmin(rx_size / 2,
- transport_mtu + sizeof(struct ntb_payload_header));
- qp->rx_max_entry = rx_size / qp->rx_max_frame;
- qp->rx_index = 0;
-
- qp->remote_rx_info->entry = qp->rx_max_entry - 1;
-
- /* Set up the hdr offsets with 0s */
- for (i = 0; i < qp->rx_max_entry; i++) {
- offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) -
- sizeof(struct ntb_payload_header));
- memset(offset, 0, sizeof(struct ntb_payload_header));
}
- qp->rx_pkts = 0;
- qp->tx_pkts = 0;
- qp->tx_index = 0;
-
- return (0);
-}
-
-static void
-ntb_qp_link_work(void *arg)
-{
- struct ntb_transport_qp *qp = arg;
- struct ntb_softc *ntb = qp->ntb;
- struct ntb_transport_ctx *nt = qp->transport;
- uint32_t val, dummy;
-
- ntb_spad_read(ntb, IF_NTB_QP_LINKS, &val);
-
- ntb_peer_spad_write(ntb, IF_NTB_QP_LINKS, val | (1ull << qp->qp_num));
-
- /* query remote spad for qp ready bits */
- ntb_peer_spad_read(ntb, IF_NTB_QP_LINKS, &dummy);
-
- /* See if the remote side is up */
- if ((val & (1ull << qp->qp_num)) != 0) {
- ntb_printf(2, "qp link up\n");
- qp->link_is_up = true;
-
- if (qp->event_handler != NULL)
- qp->event_handler(qp->cb_data, NTB_LINK_UP);
-
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- } else if (nt->link_is_up)
- callout_reset(&qp->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
-}
-
-/* Link down event*/
-static void
-ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
-{
- struct ntb_transport_qp *qp;
- struct _qpset qp_bitmap_alloc;
- unsigned i;
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
- BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
- /* Pass along the info to any clients */
- for (i = 0; i < nt->qp_count; i++)
- if (test_bit(i, &qp_bitmap_alloc)) {
- qp = &nt->qp_vec[i];
- ntb_qp_link_cleanup(qp);
- callout_drain(&qp->link_work);
- }
-
- if (!nt->link_is_up)
- callout_drain(&nt->link_work);
-
- /*
- * The scratchpad registers keep the values if the remote side
- * goes down, blast them now to give them a sane value the next
- * time they are accessed
- */
- for (i = 0; i < IF_NTB_MAX_SPAD; i++)
- ntb_spad_write(nt->ntb, i, 0);
-}
-
-static void
-ntb_transport_link_cleanup_work(void *arg, int pending __unused)
-{
-
- ntb_transport_link_cleanup(arg);
-}
-
-static void
-ntb_qp_link_down(struct ntb_transport_qp *qp)
-{
-
- ntb_qp_link_cleanup(qp);
-}
-
-static void
-ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
-{
-
- qp->link_is_up = false;
-
- qp->tx_index = qp->rx_index = 0;
- qp->tx_bytes = qp->rx_bytes = 0;
- qp->tx_pkts = qp->rx_pkts = 0;
-
- qp->rx_ring_empty = 0;
- qp->tx_ring_full = 0;
-
- qp->rx_err_no_buf = qp->tx_err_no_buf = 0;
- qp->rx_err_oflow = qp->rx_err_ver = 0;
-}
-
-static void
-ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
-{
- struct ntb_transport_ctx *nt = qp->transport;
-
- callout_drain(&qp->link_work);
- ntb_qp_link_down_reset(qp);
-
- if (qp->event_handler != NULL)
- qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
-
- if (nt->link_is_up)
- callout_reset(&qp->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
-}
-
-/* Link commanded down */
-/**
- * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
- * @qp: NTB transport layer queue to be disabled
- *
- * Notify NTB transport layer of client's desire to no longer receive data on
- * transport queue specified. It is the client's responsibility to ensure all
- * entries on queue are purged or otherwise handled appropriately.
- */
-static void
-ntb_transport_link_down(struct ntb_transport_qp *qp)
-{
- uint32_t val;
-
- if (qp == NULL)
- return;
-
- qp->client_ready = false;
-
- ntb_spad_read(qp->ntb, IF_NTB_QP_LINKS, &val);
-
- ntb_peer_spad_write(qp->ntb, IF_NTB_QP_LINKS,
- val & ~(1 << qp->qp_num));
-
- if (qp->link_is_up)
- ntb_send_link_down(qp);
- else
- callout_drain(&qp->link_work);
-}
-
-static void
-ntb_send_link_down(struct ntb_transport_qp *qp)
-{
- struct ntb_queue_entry *entry;
- int i, rc;
-
- if (!qp->link_is_up)
- return;
-
- for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
- entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
- if (entry != NULL)
+ m->m_pkthdr.rcvif = ifp;
+ if (sc->num_queues > 1) {
+ m->m_pkthdr.flowid = q - sc->queues;
+ M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
+ }
+ if (ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
+ m_copydata(m, 12, 2, (void *)&proto);
+ switch (ntohs(proto)) {
+ case ETHERTYPE_IP:
+ if (ifp->if_capenable & IFCAP_RXCSUM) {
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = NTB_CSUM_SET;
+ }
+ break;
+ case ETHERTYPE_IPV6:
+ if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = NTB_CSUM_SET;
+ }
break;
- pause("NTB Wait for link down", hz / 10);
+ }
}
-
- if (entry == NULL)
- return;
-
- entry->cb_data = NULL;
- entry->buf = NULL;
- entry->len = 0;
- entry->flags = IF_NTB_LINK_DOWN_FLAG;
-
- mtx_lock(&qp->transport->tx_lock);
- rc = ntb_process_tx(qp, entry);
- if (rc != 0)
- printf("ntb: Failed to send link down\n");
- mtx_unlock(&qp->transport->tx_lock);
-
- ntb_qp_link_down_reset(qp);
+ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
+ ifp->if_input(ifp, m);
}
-
-/* List Management */
-
static void
-ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
- struct ntb_queue_list *list)
-{
-
- mtx_lock_spin(lock);
- STAILQ_INSERT_TAIL(list, entry, entry);
- mtx_unlock_spin(lock);
-}
-
-static struct ntb_queue_entry *
-ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list)
-{
- struct ntb_queue_entry *entry;
-
- mtx_lock_spin(lock);
- if (STAILQ_EMPTY(list)) {
- entry = NULL;
- goto out;
- }
- entry = STAILQ_FIRST(list);
- STAILQ_REMOVE_HEAD(list, entry);
-out:
- mtx_unlock_spin(lock);
-
- return (entry);
-}
-
-static struct ntb_queue_entry *
-ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from,
- struct ntb_queue_list *to)
+ntb_net_event_handler(void *data, enum ntb_link_event status)
{
- struct ntb_queue_entry *entry;
+ struct ntb_net_queue *q = data;
+ int new_state;
- mtx_lock_spin(lock);
- if (STAILQ_EMPTY(from)) {
- entry = NULL;
- goto out;
+ switch (status) {
+ case NTB_LINK_DOWN:
+ new_state = LINK_STATE_DOWN;
+ break;
+ case NTB_LINK_UP:
+ new_state = LINK_STATE_UP;
+ break;
+ default:
+ new_state = LINK_STATE_UNKNOWN;
+ break;
}
- entry = STAILQ_FIRST(from);
- STAILQ_REMOVE_HEAD(from, entry);
- STAILQ_INSERT_TAIL(to, entry, entry);
-
-out:
- mtx_unlock_spin(lock);
- return (entry);
+ if_link_state_change(q->ifp, new_state);
}
/* Helper functions */
@@ -1693,27 +493,24 @@ static void
create_random_local_eui48(u_char *eaddr)
{
static uint8_t counter = 0;
- uint32_t seed = ticks;
eaddr[0] = EUI48_LOCALLY_ADMINISTERED;
- memcpy(&eaddr[1], &seed, sizeof(uint32_t));
+ arc4rand(&eaddr[1], 4, 0);
eaddr[5] = counter++;
}
-/**
- * ntb_transport_max_size - Query the max payload size of a qp
- * @qp: NTB transport layer queue to be queried
- *
- * Query the maximum payload size permissible on the given qp
- *
- * RETURNS: the max payload size of a qp
- */
-static unsigned int
-ntb_transport_max_size(struct ntb_transport_qp *qp)
-{
-
- if (qp == NULL)
- return (0);
+static device_method_t ntb_net_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ntb_net_probe),
+ DEVMETHOD(device_attach, ntb_net_attach),
+ DEVMETHOD(device_detach, ntb_net_detach),
+ DEVMETHOD_END
+};
- return (qp->tx_max_frame - sizeof(struct ntb_payload_header));
-}
+devclass_t ntb_net_devclass;
+static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods,
+ sizeof(struct ntb_net_ctx));
+DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass,
+ NULL, NULL);
+MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1);
+MODULE_VERSION(if_ntb, 1);
diff --git a/sys/dev/ntb/ntb.c b/sys/dev/ntb/ntb.c
new file mode 100644
index 0000000..1cf1ba2
--- /dev/null
+++ b/sys/dev/ntb/ntb.c
@@ -0,0 +1,463 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+
+#include "ntb.h"
+
+devclass_t ntb_hw_devclass;
+SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls");
+
+struct ntb_child {
+ device_t dev;
+ int enabled;
+ int mwoff;
+ int mwcnt;
+ int spadoff;
+ int spadcnt;
+ int dboff;
+ int dbmask;
+ void *ctx;
+ const struct ntb_ctx_ops *ctx_ops;
+ struct rmlock ctx_lock;
+ struct ntb_child *next;
+};
+
+int
+ntb_register_device(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ int i, mw, mwu, mwt, spad, spadu, spadt, db, dbu, dbt;
+ char cfg[128] = "";
+ char buf[32];
+ char *n, *np, *c, *p, *name;
+
+ mwu = 0;
+ mwt = NTB_MW_COUNT(dev);
+ spadu = 0;
+ spadt = NTB_SPAD_COUNT(dev);
+ dbu = 0;
+ dbt = flsll(NTB_DB_VALID_MASK(dev));
+
+ device_printf(dev, "%d memory windows, %d scratchpads, "
+ "%d doorbells\n", mwt, spadt, dbt);
+
+ snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev),
+ device_get_unit(dev));
+ TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg));
+ n = cfg;
+ i = 0;
+ while ((c = strsep(&n, ",")) != NULL) {
+ np = c;
+ name = strsep(&np, ":");
+ if (name != NULL && name[0] == 0)
+ name = NULL;
+ p = strsep(&np, ":");
+ mw = (p && p[0] != 0) ? strtol(p, NULL, 10) : mwt - mwu;
+ p = strsep(&np, ":");
+ spad = (p && p[0] != 0) ? strtol(p, NULL, 10) : spadt - spadu;
+ db = (np && np[0] != 0) ? strtol(np, NULL, 10) : dbt - dbu;
+
+ if (mw > mwt - mwu || spad > spadt - spadu || db > dbt - dbu) {
+ device_printf(dev, "Not enough resources for config\n");
+ break;
+ }
+
+ nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO);
+ nc->mwoff = mwu;
+ nc->mwcnt = mw;
+ nc->spadoff = spadu;
+ nc->spadcnt = spad;
+ nc->dboff = dbu;
+ nc->dbmask = (db == 0) ? 0 : (0xffffffffffffffff >> (64 - db));
+ rm_init(&nc->ctx_lock, "ntb ctx");
+ nc->dev = device_add_child(dev, name, -1);
+ if (nc->dev == NULL) {
+ ntb_unregister_device(dev);
+ return (ENOMEM);
+ }
+ device_set_ivars(nc->dev, nc);
+ *cpp = nc;
+ cpp = &nc->next;
+
+ if (bootverbose) {
+ device_printf(dev, "%d \"%s\":", i, name);
+ if (mw > 0) {
+ printf(" memory windows %d", mwu);
+ if (mw > 1)
+ printf("-%d", mwu + mw - 1);
+ }
+ if (spad > 0) {
+ printf(" scratchpads %d", spadu);
+ if (spad > 1)
+ printf("-%d", spadu + spad - 1);
+ }
+ if (db > 0) {
+ printf(" doorbells %d", dbu);
+ if (db > 1)
+ printf("-%d", dbu + db - 1);
+ }
+ printf("\n");
+ }
+
+ mwu += mw;
+ spadu += spad;
+ dbu += db;
+ i++;
+ }
+
+ bus_generic_attach(dev);
+ return (0);
+}
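+
+/*
+ * Editor's note: the hint parsed above is a comma-separated list of
+ * "name:mw:spad:db" specs, where an empty field takes all remaining
+ * resources of that kind.  A hypothetical loader.conf line splitting the
+ * hardware between two transport children might look like (names and
+ * counts are illustrative; see ntb_hw(4)):
+ *
+ *	hint.ntb_hw.0.config="ntb_transport:1:2:16,ntb_transport:1:2:16"
+ */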
+
+int
+ntb_unregister_device(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ int error = 0;
+
+ while ((nc = *cpp) != NULL) {
+ *cpp = (*cpp)->next;
+ error = device_delete_child(dev, nc->dev);
+ if (error)
+ break;
+ rm_destroy(&nc->ctx_lock);
+ free(nc, M_DEVBUF);
+ }
+ return (error);
+}
+
+void
+ntb_link_event(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ struct rm_priotracker ctx_tracker;
+
+ for (nc = *cpp; nc != NULL; nc = nc->next) {
+ rm_rlock(&nc->ctx_lock, &ctx_tracker);
+ if (nc->ctx_ops != NULL && nc->ctx_ops->link_event != NULL)
+ nc->ctx_ops->link_event(nc->ctx);
+ rm_runlock(&nc->ctx_lock, &ctx_tracker);
+ }
+}
+
+void
+ntb_db_event(device_t dev, uint32_t vec)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ struct rm_priotracker ctx_tracker;
+
+ for (nc = *cpp; nc != NULL; nc = nc->next) {
+ rm_rlock(&nc->ctx_lock, &ctx_tracker);
+ if (nc->ctx_ops != NULL && nc->ctx_ops->db_event != NULL)
+ nc->ctx_ops->db_event(nc->ctx, vec);
+ rm_runlock(&nc->ctx_lock, &ctx_tracker);
+ }
+}
+
+bool
+ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width)
+{
+
+ return (NTB_LINK_IS_UP(device_get_parent(ntb), speed, width));
+}
+
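+/*
+ * The hardware link is shared by all children of the device: the first
+ * child to call ntb_link_enable() brings the link up, and it is taken down
+ * only when the last enabled child calls ntb_link_disable().
+ */
+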
+int
+ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev));
+ struct ntb_child *nc1;
+
+ for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) {
+ if (nc1->enabled) {
+ nc->enabled = 1;
+ return (0);
+ }
+ }
+ nc->enabled = 1;
+ return (NTB_LINK_ENABLE(device_get_parent(ntb), speed, width));
+}
+
+int
+ntb_link_disable(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev));
+ struct ntb_child *nc1;
+
+ if (!nc->enabled)
+ return (0);
+ nc->enabled = 0;
+ for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) {
+ if (nc1->enabled)
+ return (0);
+ }
+ return (NTB_LINK_DISABLE(device_get_parent(ntb)));
+}
+
+bool
+ntb_link_enabled(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->enabled && NTB_LINK_ENABLED(device_get_parent(ntb)));
+}
+
+int
+ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ if (ctx == NULL || ctx_ops == NULL)
+ return (EINVAL);
+
+ rm_wlock(&nc->ctx_lock);
+ if (nc->ctx_ops != NULL) {
+ rm_wunlock(&nc->ctx_lock);
+ return (EINVAL);
+ }
+ nc->ctx = ctx;
+ nc->ctx_ops = ctx_ops;
+ rm_wunlock(&nc->ctx_lock);
+
+ return (0);
+}
+
+void *
+ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ KASSERT(nc->ctx != NULL && nc->ctx_ops != NULL, ("bogus"));
+ if (ctx_ops != NULL)
+ *ctx_ops = nc->ctx_ops;
+ return (nc->ctx);
+}
+
+void
+ntb_clear_ctx(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ rm_wlock(&nc->ctx_lock);
+ nc->ctx = NULL;
+ nc->ctx_ops = NULL;
+ rm_wunlock(&nc->ctx_lock);
+}
+
+uint8_t
+ntb_mw_count(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->mwcnt);
+}
+
+int
+ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base,
+ caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
+ bus_addr_t *plimit)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_GET_RANGE(device_get_parent(ntb), mw_idx + nc->mwoff,
+ base, vbase, size, align, align_size, plimit));
+}
+
+int
+ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr, size_t size)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_SET_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff,
+ addr, size));
+}
+
+int
+ntb_mw_clear_trans(device_t ntb, unsigned mw_idx)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_CLEAR_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff));
+}
+
+int
+ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_GET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode));
+}
+
+int
+ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_SET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode));
+}
+
+uint8_t
+ntb_spad_count(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->spadcnt);
+}
+
+void
+ntb_spad_clear(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ unsigned i;
+
+ for (i = 0; i < nc->spadcnt; i++)
+ NTB_SPAD_WRITE(device_get_parent(ntb), i + nc->spadoff, 0);
+}
+
+int
+ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff, val));
+}
+
+int
+ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff, val));
+}
+
+int
+ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff,
+ val));
+}
+
+int
+ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff,
+ val));
+}
+
+uint64_t
+ntb_db_valid_mask(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->dbmask);
+}
+
+int
+ntb_db_vector_count(device_t ntb)
+{
+
+ return (NTB_DB_VECTOR_COUNT(device_get_parent(ntb)));
+}
+
+uint64_t
+ntb_db_vector_mask(device_t ntb, uint32_t vector)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return ((NTB_DB_VECTOR_MASK(device_get_parent(ntb), vector)
+ >> nc->dboff) & nc->dbmask);
+}
+
+int
+ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size)
+{
+
+ return (NTB_PEER_DB_ADDR(device_get_parent(ntb), db_addr, db_size));
+}
+
+void
+ntb_db_clear(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_CLEAR(device_get_parent(ntb), bits << nc->dboff));
+}
+
+void
+ntb_db_clear_mask(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_CLEAR_MASK(device_get_parent(ntb), bits << nc->dboff));
+}
+
+uint64_t
+ntb_db_read(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return ((NTB_DB_READ(device_get_parent(ntb)) >> nc->dboff)
+ & nc->dbmask);
+}
+
+void
+ntb_db_set_mask(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_SET_MASK(device_get_parent(ntb), bits << nc->dboff));
+}
+
+void
+ntb_peer_db_set(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_DB_SET(device_get_parent(ntb), bits << nc->dboff));
+}
+
+MODULE_VERSION(ntb, 1);
diff --git a/sys/dev/ntb/ntb.h b/sys/dev/ntb/ntb.h
new file mode 100644
index 0000000..8593c65
--- /dev/null
+++ b/sys/dev/ntb/ntb.h
@@ -0,0 +1,409 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NTB_H_
+#define _NTB_H_
+
+#include "ntb_if.h"
+
+extern devclass_t ntb_hw_devclass;
+SYSCTL_DECL(_hw_ntb);
+
+int ntb_register_device(device_t ntb);
+int ntb_unregister_device(device_t ntb);
+
+/*
+ * ntb_link_event() - notify driver context of a change in link status
+ * @ntb: NTB device context
+ *
+ * Notify the driver context that the link status may have changed. The driver
+ * should call ntb_link_is_up() to get the current status.
+ */
+void ntb_link_event(device_t ntb);
+
+/*
+ * ntb_db_event() - notify driver context of a doorbell event
+ * @ntb: NTB device context
+ * @vector: Interrupt vector number
+ *
+ * Notify the driver context of a doorbell event. If hardware supports
+ * multiple interrupt vectors for doorbells, the vector number indicates which
+ * vector received the interrupt. The vector number is relative to the first
+ * vector used for doorbells, starting at zero, and must be less than
+ * ntb_db_vector_count(). The driver may call ntb_db_read() to check which
+ * doorbell bits need service, and ntb_db_vector_mask() to determine which of
+ * those bits are associated with the vector number.
+ */
+void ntb_db_event(device_t ntb, uint32_t vec);
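+
+/*
+ * A minimal sketch (editor's illustration, not part of this change) of a
+ * client db_event handler following the rules above: read the doorbell
+ * bits, keep only those serviced by this vector, clear them to re-arm,
+ * then do the signalled work.  The mydrv_* names and the use of the child
+ * device as ctx are hypothetical:
+ *
+ *	static void
+ *	mydrv_db_event(void *ctx, uint32_t vec)
+ *	{
+ *		device_t dev = ctx;
+ *		uint64_t db;
+ *
+ *		db = ntb_db_read(dev) & ntb_db_vector_mask(dev, vec);
+ *		ntb_db_clear(dev, db);
+ *		mydrv_process(dev, db);
+ *	}
+ */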
+
+/*
+ * ntb_link_is_up() - get the current ntb link state
+ * @ntb: NTB device context
+ * @speed: OUT - The link speed expressed as PCIe generation number
+ * @width: OUT - The link width expressed as the number of PCIe lanes
+ *
+ * RETURNS: true or false based on the hardware link state
+ */
+bool ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width);
+
+/*
+ * ntb_link_enable() - enable the link on the secondary side of the ntb
+ * @ntb: NTB device context
+ * @max_speed: The maximum link speed expressed as PCIe generation number[0]
+ * @max_width: The maximum link width expressed as the number of PCIe lanes[0]
+ *
+ * Enable the link on the secondary side of the ntb. This can only be done
+ * from the primary side of the ntb in primary or b2b topology. The ntb device
+ * should train the link to its maximum speed and width, or the requested speed
+ * and width, whichever is smaller, if supported.
+ *
+ * Return: Zero on success, otherwise an error number.
+ *
+ * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed
+ * and width input will be ignored.
+ */
+int ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width);
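+
+/*
+ * Typical usage (editor's illustration), per note [0] above:
+ *
+ *	error = ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ */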
+
+/*
+ * ntb_link_disable() - disable the link on the secondary side of the ntb
+ * @ntb: NTB device context
+ *
+ * Disable the link on the secondary side of the ntb. This can only be done
+ * from the primary side of the ntb in primary or b2b topology. The ntb device
+ * should disable the link. Returning from this call must indicate that a
+ * barrier has passed: no more writes will pass in either direction across
+ * the link, unless this call returns an error number.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_link_disable(device_t ntb);
+
+/*
+ * ntb_link_enabled() - get the enable status of the link on the secondary side
+ */
+bool ntb_link_enabled(device_t ntb);
+
+/*
+ * ntb_set_ctx() - associate a driver context with an ntb device
+ * @ntb: NTB device context
+ * @ctx: Driver context
+ * @ctx_ops: Driver context operations
+ *
+ * Associate a driver context and operations with an ntb device. The context is
+ * provided by the client driver, and the driver may associate a different
+ * context with each ntb device.
+ *
+ * Return: Zero if the context is associated, otherwise an error number.
+ */
+int ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops);
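+
+/*
+ * A minimal registration sketch (editor's illustration), assuming the
+ * link_event/db_event callbacks dispatched by ntb.c; the mydrv_* names
+ * are hypothetical:
+ *
+ *	static const struct ntb_ctx_ops mydrv_ops = {
+ *		.link_event = mydrv_link_event,
+ *		.db_event = mydrv_db_event,
+ *	};
+ *
+ *	error = ntb_set_ctx(dev, sc, &mydrv_ops);
+ */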
+
+/*
+ * ntb_get_ctx() - get the driver context associated with an ntb device
+ * @ntb: NTB device context
+ * @ctx_ops: Driver context operations
+ *
+ * Get the driver context and operations associated with an ntb device.
+ */
+void * ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops);
+
+/*
+ * ntb_clear_ctx() - disassociate any driver context from an ntb device
+ * @ntb: NTB device context
+ *
+ * Clear any association that may exist between a driver context and the ntb
+ * device.
+ */
+void ntb_clear_ctx(device_t ntb);
+
+/*
+ * ntb_mw_count() - Get the number of memory windows available for KPI
+ * consumers.
+ *
+ * (Excludes any MW wholly reserved for register access.)
+ */
+uint8_t ntb_mw_count(device_t ntb);
+
+/*
+ * ntb_mw_get_range() - get the range of a memory window
+ * @ntb: NTB device context
+ * @mw_idx: Memory window number
+ * @base: OUT - the base physical address for mapping the memory window
+ * @vbase: OUT - the kernel virtual address of the mapped memory window
+ * @size: OUT - the size for mapping the memory window
+ * @align: OUT - the base alignment for translating the memory window
+ * @align_size: OUT - the size alignment for translating the memory window
+ * @plimit: OUT - the highest bus address usable for the translation
+ *
+ * Get the range of a memory window. NULL may be given for any output
+ * parameter if the value is not needed. The base and size may be used for
+ * mapping the memory window, to access the peer memory. The alignment and
+ * size may be used for translating the memory window, for the peer to access
+ * memory on the local system.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base,
+ caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
+ bus_addr_t *plimit);
+
+/*
+ * ntb_mw_set_trans() - set the translation of a memory window
+ * @ntb: NTB device context
+ * @mw_idx: Memory window number
+ * @addr: The dma address of local memory to expose to the peer
+ * @size: The size of the local memory to expose to the peer
+ *
+ * Set the translation of a memory window. The peer may access local memory
+ * through the window starting at the address, up to the size. The address
+ * must be aligned to the alignment specified by ntb_mw_get_range(). The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range(). The
+ * address must be below the plimit specified by ntb_mw_get_range() (e.g. for
+ * 32-bit BARs).
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr,
+ size_t size);
+
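+/*
+ * A minimal sketch (editor's illustration) of exposing local memory through
+ * a window, mirroring the contigmalloc/vtophys flow of the existing
+ * transport (proper busdma and error handling elided):
+ *
+ *	size_t size, align, align_size;
+ *	bus_addr_t plimit;
+ *	caddr_t buf;
+ *
+ *	ntb_mw_get_range(dev, 0, NULL, NULL, &size, &align, &align_size,
+ *	    &plimit);
+ *	buf = contigmalloc(size, M_DEVBUF, M_WAITOK | M_ZERO, 0, plimit,
+ *	    align, 0);
+ *	error = ntb_mw_set_trans(dev, 0, vtophys(buf), size);
+ */
+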
+/*
+ * ntb_mw_clear_trans() - clear the translation of a memory window
+ * @ntb: NTB device context
+ * @mw_idx: Memory window number
+ *
+ * Clear the translation of a memory window. The peer may no longer access
+ * local memory through the window.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_clear_trans(device_t ntb, unsigned mw_idx);
+
+/*
+ * ntb_mw_get_wc - Get the write-combine status of a memory window
+ *
+ * Returns: Zero on success, setting *mode; otherwise an error number (e.g. if
+ * mw_idx is an invalid memory window).
+ *
+ * Mode is a VM_MEMATTR_* type.
+ */
+int ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode);
+
+/*
+ * ntb_mw_set_wc - Set the write-combine status of a memory window
+ *
+ * If 'mode' matches the current status, this does nothing and succeeds. Mode
+ * is a VM_MEMATTR_* type.
+ *
+ * Returns: Zero on success, setting the caching attribute on the virtual
+ * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid
+ * memory window, or if changing the caching attribute fails).
+ */
+int ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode);
+
+/*
+ * ntb_spad_count() - get the total scratch regs usable
+ * @ntb: NTB device context
+ *
+ * This function returns the maximum number of 32bit scratchpad registers
+ * usable by the upper layer.
+ *
+ * RETURNS: total number of scratch pad registers available
+ */
+uint8_t ntb_spad_count(device_t ntb);
+
+/*
+ * ntb_spad_clear() - zero local scratch registers
+ * @ntb: NTB device context
+ *
+ * This function overwrites all local scratchpad registers with zeroes.
+ */
+void ntb_spad_clear(device_t ntb);
+
+/*
+ * ntb_spad_write() - write to the local scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the primary (internal) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val);
+
+/*
+ * ntb_spad_read() - read from the primary scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val);
+
+/*
+ * ntb_peer_spad_write() - write to the secondary scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the secondary (external) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val);
+
+/*
+ * ntb_peer_spad_read() - read from the peer scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the secondary (external) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val);
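+
+/*
+ * A minimal handshake sketch (editor's illustration) in the style of the
+ * transport's link bring-up: publish a value into the peer's scratchpad,
+ * then poll the local scratchpad for the value the peer published.  The
+ * MY_SPAD_VERSION/MY_VERSION names are hypothetical:
+ *
+ *	uint32_t val;
+ *
+ *	ntb_peer_spad_write(dev, MY_SPAD_VERSION, MY_VERSION);
+ *	ntb_spad_read(dev, MY_SPAD_VERSION, &val);
+ *	if (val != MY_VERSION)
+ *		return;
+ *
+ * A mismatch simply means the peer is not up yet; the transport retries
+ * from a callout.
+ */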
+
+/*
+ * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
+ * @ntb: NTB device context
+ *
+ * Hardware may support a different number or arrangement of doorbell bits.
+ *
+ * Return: A mask of doorbell bits supported by the ntb.
+ */
+uint64_t ntb_db_valid_mask(device_t ntb);
+
+/*
+ * ntb_db_vector_count() - get the number of doorbell interrupt vectors
+ * @ntb: NTB device context.
+ *
+ * Hardware may support a different number of interrupt vectors.
+ *
+ * Return: The number of doorbell interrupt vectors.
+ */
+int ntb_db_vector_count(device_t ntb);
+
+/*
+ * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
+ * @ntb: NTB device context
+ * @vector: Doorbell vector number
+ *
+ * Each interrupt vector may have a different number or arrangement of bits.
+ *
+ * Return: A mask of doorbell bits serviced by a vector.
+ */
+uint64_t ntb_db_vector_mask(device_t ntb, uint32_t vector);
+
+/*
+ * ntb_peer_db_addr() - address and size of the peer doorbell register
+ * @ntb: NTB device context.
+ * @db_addr: OUT - The address of the peer doorbell register.
+ * @db_size: OUT - The number of bytes to write to the peer doorbell register.
+ *
+ * Return the address of the peer doorbell register. This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ * The drivers may wish to ring the peer doorbell at the completion of memory
+ * copy operations. For efficiency, and to simplify ordering of operations
+ * between the dma memory copies and the ringing doorbell, the driver may
+ * append one additional dma memory copy with the doorbell register as the
+ * destination, after the memory copy operations.
+ *
+ * Return: Zero on success, otherwise an error number.
+ *
+ * Note that writing the peer doorbell via a memory window will *not* generate
+ * an interrupt on the remote host; that must be done separately.
+ */
+int ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size);
+
+/*
+ * ntb_db_clear() - clear bits in the local doorbell register
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell register, arming the bits for the next
+ * doorbell.
+ */
+void ntb_db_clear(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_db_clear_mask() - clear bits in the local doorbell mask
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell mask register, allowing doorbell interrupts
+ * to be generated for those doorbell bits. If a doorbell bit is already
+ * set at the time the mask is cleared, and the corresponding mask bit is
+ * changed from set to clear, then the ntb driver must ensure that
+ * ntb_db_event() is called. If the hardware does not generate the interrupt
+ * on clearing the mask bit, then the driver must call ntb_db_event() anyway.
+ */
+void ntb_db_clear_mask(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_db_read() - read the local doorbell register
+ * @ntb: NTB device context.
+ *
+ * Read the local doorbell register, and return the bits that are set.
+ *
+ * Return: The bits currently set in the local doorbell register.
+ */
+uint64_t ntb_db_read(device_t ntb);
+
+/*
+ * ntb_db_set_mask() - set bits in the local doorbell mask
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell mask bits to set.
+ *
+ * Set bits in the local doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits. Bits that were already set
+ * must remain set.
+ */
+void ntb_db_set_mask(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_peer_db_set() - Set the doorbell on the secondary/external side
+ * @ntb: NTB device context
+ * @bits: doorbell bits to ring
+ *
+ * This function allows triggering of a doorbell on the secondary/external
+ * side that will initiate an interrupt on the remote host.
+ */
+void ntb_peer_db_set(device_t ntb, uint64_t bits);
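+
+/*
+ * A minimal producer-side ordering sketch (editor's illustration), as in
+ * the transport's TX path: copy the payload into the peer-visible window,
+ * issue a write barrier, publish the descriptor flag, then ring the peer
+ * doorbell.  DESC_DONE_FLAG, hdr, offset, and qpnum stand in for the
+ * transport's equivalents:
+ *
+ *	memcpy(offset, buf, len);
+ *	wmb();
+ *	iowrite32(flags | DESC_DONE_FLAG, &hdr->flags);
+ *	ntb_peer_db_set(dev, 1ull << qpnum);
+ */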
+
+#endif /* _NTB_H_ */
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.c b/sys/dev/ntb/ntb_hw/ntb_hw.c
index b757f01..609aa4d 100644
--- a/sys/dev/ntb/ntb_hw/ntb_hw.c
+++ b/sys/dev/ntb/ntb_hw/ntb_hw.c
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -25,6 +26,16 @@
* SUCH DAMAGE.
*/
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a driver for NTB hardware in Intel Xeon/Atom CPUs.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -33,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/endian.h>
+#include <sys/interrupt.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
@@ -51,19 +63,7 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include "ntb_regs.h"
-#include "ntb_hw.h"
-
-/*
- * The Non-Transparent Bridge (NTB) is a device on some Intel processors that
- * allows you to connect two systems using a PCI-e link.
- *
- * This module contains the hardware abstraction layer for the NTB. It allows
- * you to send and recieve interrupts, map the memory windows and send and
- * receive messages in the scratch-pad registers.
- *
- * NOTE: Much of the code in this module is shared with Linux. Any patches may
- * be picked up and redistributed in Linux with a dual GPL/BSD license.
- */
+#include "../ntb.h"
#define MAX_MSIX_INTERRUPTS MAX(XEON_DB_COUNT, ATOM_DB_COUNT)
@@ -71,8 +71,6 @@ __FBSDID("$FreeBSD$");
#define ATOM_LINK_RECOVERY_TIME 500 /* ms */
#define BAR_HIGH_MASK (~((1ull << 12) - 1))
-#define DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev))
-
#define NTB_MSIX_VER_GUARD 0xaabbccdd
#define NTB_MSIX_RECEIVED 0xe0f0e0f0
@@ -123,8 +121,8 @@ enum {
};
/* Device features and workarounds */
-#define HAS_FEATURE(feature) \
- ((ntb->features & (feature)) != 0)
+#define HAS_FEATURE(ntb, feature) \
+ (((ntb)->features & (feature)) != 0)
struct ntb_hw_info {
uint32_t device_id;
@@ -203,6 +201,9 @@ struct ntb_msix_data {
};
struct ntb_softc {
+ /* ntb.c context. Do not move! Must go first! */
+ void *ntb_store;
+
device_t device;
enum ntb_device_type type;
uint32_t features;
@@ -221,13 +222,7 @@ struct ntb_softc {
struct callout heartbeat_timer;
struct callout lr_timer;
- void *ntb_ctx;
- const struct ntb_ctx_ops *ctx_ops;
struct ntb_vec *msix_vec;
-#define CTX_LOCK(sc) mtx_lock(&(sc)->ctx_lock)
-#define CTX_UNLOCK(sc) mtx_unlock(&(sc)->ctx_lock)
-#define CTX_ASSERT(sc,f) mtx_assert(&(sc)->ctx_lock, (f))
- struct mtx ctx_lock;
uint32_t ppd;
enum ntb_conn_type conn_type;
@@ -259,6 +254,7 @@ struct ntb_softc {
uint64_t db_valid_mask;
uint64_t db_link_mask;
uint64_t db_mask;
+ uint64_t fake_db_bell; /* NTB_SB01BASE_LOCKUP */
int last_ts; /* ticks @ last irq */
@@ -288,61 +284,74 @@ bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t handle,
}
#endif
-#define ntb_bar_read(SIZE, bar, offset) \
+#define intel_ntb_bar_read(SIZE, bar, offset) \
bus_space_read_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
ntb->bar_info[(bar)].pci_bus_handle, (offset))
-#define ntb_bar_write(SIZE, bar, offset, val) \
+#define intel_ntb_bar_write(SIZE, bar, offset, val) \
bus_space_write_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
ntb->bar_info[(bar)].pci_bus_handle, (offset), (val))
-#define ntb_reg_read(SIZE, offset) ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset)
-#define ntb_reg_write(SIZE, offset, val) \
- ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val)
-#define ntb_mw_read(SIZE, offset) \
- ntb_bar_read(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), offset)
-#define ntb_mw_write(SIZE, offset, val) \
- ntb_bar_write(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
+#define intel_ntb_reg_read(SIZE, offset) \
+ intel_ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset)
+#define intel_ntb_reg_write(SIZE, offset, val) \
+ intel_ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val)
+#define intel_ntb_mw_read(SIZE, offset) \
+ intel_ntb_bar_read(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
+ offset)
+#define intel_ntb_mw_write(SIZE, offset, val) \
+ intel_ntb_bar_write(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
offset, val)
-static int ntb_probe(device_t device);
-static int ntb_attach(device_t device);
-static int ntb_detach(device_t device);
-static unsigned ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx);
-static inline enum ntb_bar ntb_mw_to_bar(struct ntb_softc *, unsigned mw);
+static int intel_ntb_probe(device_t device);
+static int intel_ntb_attach(device_t device);
+static int intel_ntb_detach(device_t device);
+static uint64_t intel_ntb_db_valid_mask(device_t dev);
+static void intel_ntb_spad_clear(device_t dev);
+static uint64_t intel_ntb_db_vector_mask(device_t dev, uint32_t vector);
+static bool intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed,
+ enum ntb_width *width);
+static int intel_ntb_link_enable(device_t dev, enum ntb_speed speed,
+ enum ntb_width width);
+static int intel_ntb_link_disable(device_t dev);
+static int intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val);
+static int intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val);
+
+static unsigned intel_ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx);
+static inline enum ntb_bar intel_ntb_mw_to_bar(struct ntb_softc *, unsigned mw);
static inline bool bar_is_64bit(struct ntb_softc *, enum ntb_bar);
static inline void bar_get_xlat_params(struct ntb_softc *, enum ntb_bar,
uint32_t *base, uint32_t *xlat, uint32_t *lmt);
-static int ntb_map_pci_bars(struct ntb_softc *ntb);
-static int ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx,
+static int intel_ntb_map_pci_bars(struct ntb_softc *ntb);
+static int intel_ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx,
vm_memattr_t);
static void print_map_success(struct ntb_softc *, struct ntb_pci_bar_info *,
const char *);
static int map_mmr_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar);
static int map_memory_window_bar(struct ntb_softc *ntb,
struct ntb_pci_bar_info *bar);
-static void ntb_unmap_pci_bar(struct ntb_softc *ntb);
-static int ntb_remap_msix(device_t, uint32_t desired, uint32_t avail);
-static int ntb_init_isr(struct ntb_softc *ntb);
-static int ntb_setup_legacy_interrupt(struct ntb_softc *ntb);
-static int ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors);
-static void ntb_teardown_interrupts(struct ntb_softc *ntb);
-static inline uint64_t ntb_vec_mask(struct ntb_softc *, uint64_t db_vector);
-static void ntb_interrupt(struct ntb_softc *, uint32_t vec);
+static void intel_ntb_unmap_pci_bar(struct ntb_softc *ntb);
+static int intel_ntb_remap_msix(device_t, uint32_t desired, uint32_t avail);
+static int intel_ntb_init_isr(struct ntb_softc *ntb);
+static int intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb);
+static int intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors);
+static void intel_ntb_teardown_interrupts(struct ntb_softc *ntb);
+static inline uint64_t intel_ntb_vec_mask(struct ntb_softc *, uint64_t db_vector);
+static void intel_ntb_interrupt(struct ntb_softc *, uint32_t vec);
static void ndev_vec_isr(void *arg);
static void ndev_irq_isr(void *arg);
static inline uint64_t db_ioread(struct ntb_softc *, uint64_t regoff);
static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t);
static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t);
-static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
-static void ntb_free_msix_vec(struct ntb_softc *ntb);
-static void ntb_get_msix_info(struct ntb_softc *ntb);
-static void ntb_exchange_msix(void *);
-static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id);
-static void ntb_detect_max_mw(struct ntb_softc *ntb);
-static int ntb_detect_xeon(struct ntb_softc *ntb);
-static int ntb_detect_atom(struct ntb_softc *ntb);
-static int ntb_xeon_init_dev(struct ntb_softc *ntb);
-static int ntb_atom_init_dev(struct ntb_softc *ntb);
-static void ntb_teardown_xeon(struct ntb_softc *ntb);
+static int intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
+static void intel_ntb_free_msix_vec(struct ntb_softc *ntb);
+static void intel_ntb_get_msix_info(struct ntb_softc *ntb);
+static void intel_ntb_exchange_msix(void *);
+static struct ntb_hw_info *intel_ntb_get_device_info(uint32_t device_id);
+static void intel_ntb_detect_max_mw(struct ntb_softc *ntb);
+static int intel_ntb_detect_xeon(struct ntb_softc *ntb);
+static int intel_ntb_detect_atom(struct ntb_softc *ntb);
+static int intel_ntb_xeon_init_dev(struct ntb_softc *ntb);
+static int intel_ntb_atom_init_dev(struct ntb_softc *ntb);
+static void intel_ntb_teardown_xeon(struct ntb_softc *ntb);
static void configure_atom_secondary_side_bars(struct ntb_softc *ntb);
static void xeon_reset_sbar_size(struct ntb_softc *, enum ntb_bar idx,
enum ntb_bar regbar);
@@ -352,18 +361,16 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr,
enum ntb_bar idx);
static int xeon_setup_b2b_mw(struct ntb_softc *,
const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr);
-static int xeon_setup_msix_bar(struct ntb_softc *);
static inline bool link_is_up(struct ntb_softc *ntb);
static inline bool _xeon_link_is_up(struct ntb_softc *ntb);
static inline bool atom_link_is_err(struct ntb_softc *ntb);
-static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *);
-static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *);
+static inline enum ntb_speed intel_ntb_link_sta_speed(struct ntb_softc *);
+static inline enum ntb_width intel_ntb_link_sta_width(struct ntb_softc *);
static void atom_link_hb(void *arg);
-static void ntb_db_event(struct ntb_softc *ntb, uint32_t vec);
static void recover_atom_link(void *arg);
-static bool ntb_poll_link(struct ntb_softc *ntb);
+static bool intel_ntb_poll_link(struct ntb_softc *ntb);
static void save_bar_parameters(struct ntb_pci_bar_info *bar);
-static void ntb_sysctl_init(struct ntb_softc *);
+static void intel_ntb_sysctl_init(struct ntb_softc *);
static int sysctl_handle_features(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS);
@@ -374,7 +381,7 @@ static unsigned g_ntb_hw_debug_level;
TUNABLE_INT("hw.ntb.debug_level", &g_ntb_hw_debug_level);
SYSCTL_UINT(_hw_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN,
&g_ntb_hw_debug_level, 0, "ntb_hw log level -- higher is more verbose");
-#define ntb_printf(lvl, ...) do { \
+#define intel_ntb_printf(lvl, ...) do { \
if ((lvl) <= g_ntb_hw_debug_level) { \
device_printf(ntb->device, __VA_ARGS__); \
} \
@@ -398,7 +405,7 @@ SYSCTL_UINT(_hw_ntb, OID_AUTO, default_mw_pat, CTLFLAG_RDTUN,
"UC-: " __XSTRING(_NTB_PAT_UCM));
static inline vm_memattr_t
-ntb_pat_flags(void)
+intel_ntb_pat_flags(void)
{
switch (g_ntb_mw_pat) {
@@ -424,7 +431,7 @@ ntb_pat_flags(void)
* anywhere better yet.
*/
static inline const char *
-ntb_vm_memattr_to_str(vm_memattr_t pat)
+intel_ntb_vm_memattr_to_str(vm_memattr_t pat)
{
switch (pat) {
@@ -445,7 +452,8 @@ ntb_vm_memattr_to_str(vm_memattr_t pat)
}
}
-static int g_ntb_msix_idx = 0;
+static int g_ntb_msix_idx = 1;
+TUNABLE_INT("hw.ntb.msix_mw_idx", &g_ntb_msix_idx);
SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, CTLFLAG_RDTUN, &g_ntb_msix_idx,
0, "Use this memory window to access the peer MSIX message complex on "
"certain Xeon-based NTB systems, as a workaround for a hardware errata. "
@@ -461,6 +469,18 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx,
"available memory window. Both sides of the NTB MUST set the same "
"value here! (Applies on Xeon platforms with SDOORBELL_LOCKUP errata.)");
+/* Hardware owns the low 16 bits of features. */
+#define NTB_BAR_SIZE_4K (1 << 0)
+#define NTB_SDOORBELL_LOCKUP (1 << 1)
+#define NTB_SB01BASE_LOCKUP (1 << 2)
+#define NTB_B2BDOORBELL_BIT14 (1 << 3)
+/* Software/configuration owns the top 16 bits. */
+#define NTB_SPLIT_BAR (1ull << 16)
+
+#define NTB_FEATURES_STR \
+ "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \
+ "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K"
+
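NTB_FEATURES_STR follows the kernel printf(9) %b convention: the leading \20 byte selects base 16, and each following \NN byte is a 1-based bit position naming the flag after it, so \21 labels bit 17 (1ull << 16, i.e. NTB_SPLIT_BAR). A hedged illustration of how sysctl_handle_features() below renders it:

    uint64_t features = NTB_SDOORBELL_LOCKUP | NTB_SPLIT_BAR;  /* 0x10002 */
    printf("%b\n", (int)features, NTB_FEATURES_STR);
    /* prints roughly: 10002<SPLIT_BAR4,SDOORBELL_LOCKUP> */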
static struct ntb_hw_info pci_ids[] = {
/* XXX: PS/SS IDs left out until they are supported. */
{ 0x0C4E8086, "BWD Atom Processor S1200 Non-Transparent Bridge B2B",
@@ -609,35 +629,15 @@ SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar5_addr32, CTLFLAG_RDTUN,
*/
MALLOC_DEFINE(M_NTB, "ntb_hw", "ntb_hw driver memory allocations");
-static device_method_t ntb_pci_methods[] = {
- /* Device interface */
- DEVMETHOD(device_probe, ntb_probe),
- DEVMETHOD(device_attach, ntb_attach),
- DEVMETHOD(device_detach, ntb_detach),
- DEVMETHOD_END
-};
-
-static driver_t ntb_pci_driver = {
- "ntb_hw",
- ntb_pci_methods,
- sizeof(struct ntb_softc),
-};
-
-static devclass_t ntb_devclass;
-DRIVER_MODULE(ntb_hw, pci, ntb_pci_driver, ntb_devclass, NULL, NULL);
-MODULE_VERSION(ntb_hw, 1);
-
-SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls");
-
/*
* OS <-> Driver linkage functions
*/
static int
-ntb_probe(device_t device)
+intel_ntb_probe(device_t device)
{
struct ntb_hw_info *p;
- p = ntb_get_device_info(pci_get_devid(device));
+ p = intel_ntb_get_device_info(pci_get_devid(device));
if (p == NULL)
return (ENXIO);
@@ -646,14 +646,14 @@ ntb_probe(device_t device)
}
static int
-ntb_attach(device_t device)
+intel_ntb_attach(device_t device)
{
struct ntb_softc *ntb;
struct ntb_hw_info *p;
int error;
- ntb = DEVICE2SOFTC(device);
- p = ntb_get_device_info(pci_get_devid(device));
+ ntb = device_get_softc(device);
+ p = intel_ntb_get_device_info(pci_get_devid(device));
ntb->device = device;
ntb->type = p->type;
@@ -666,47 +666,52 @@ ntb_attach(device_t device)
callout_init(&ntb->lr_timer, CALLOUT_MPSAFE);
callout_init(&ntb->peer_msix_work, 1);
mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN);
- mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF);
if (ntb->type == NTB_ATOM)
- error = ntb_detect_atom(ntb);
+ error = intel_ntb_detect_atom(ntb);
else
- error = ntb_detect_xeon(ntb);
+ error = intel_ntb_detect_xeon(ntb);
if (error != 0)
goto out;
- ntb_detect_max_mw(ntb);
+ intel_ntb_detect_max_mw(ntb);
pci_enable_busmaster(ntb->device);
- error = ntb_map_pci_bars(ntb);
+ error = intel_ntb_map_pci_bars(ntb);
if (error != 0)
goto out;
if (ntb->type == NTB_ATOM)
- error = ntb_atom_init_dev(ntb);
+ error = intel_ntb_atom_init_dev(ntb);
else
- error = ntb_xeon_init_dev(ntb);
+ error = intel_ntb_xeon_init_dev(ntb);
if (error != 0)
goto out;
- ntb_spad_clear(ntb);
+ intel_ntb_spad_clear(device);
+
+ intel_ntb_poll_link(ntb);
- ntb_poll_link(ntb);
+ intel_ntb_sysctl_init(ntb);
- ntb_sysctl_init(ntb);
+ /* Attach children to this controller */
+ error = ntb_register_device(device);
out:
if (error != 0)
- ntb_detach(device);
+ intel_ntb_detach(device);
return (error);
}
static int
-ntb_detach(device_t device)
+intel_ntb_detach(device_t device)
{
struct ntb_softc *ntb;
- ntb = DEVICE2SOFTC(device);
+ ntb = device_get_softc(device);
+
+ /* Detach & delete all children */
+ ntb_unregister_device(device);
if (ntb->self_reg != NULL) {
DB_MASK_LOCK(ntb);
@@ -718,13 +723,12 @@ ntb_detach(device_t device)
callout_drain(&ntb->peer_msix_work);
pci_disable_busmaster(ntb->device);
if (ntb->type == NTB_XEON)
- ntb_teardown_xeon(ntb);
- ntb_teardown_interrupts(ntb);
+ intel_ntb_teardown_xeon(ntb);
+ intel_ntb_teardown_interrupts(ntb);
mtx_destroy(&ntb->db_mask_lock);
- mtx_destroy(&ntb->ctx_lock);
- ntb_unmap_pci_bar(ntb);
+ intel_ntb_unmap_pci_bar(ntb);
return (0);
}
@@ -733,7 +737,7 @@ ntb_detach(device_t device)
* Driver internal routines
*/
static inline enum ntb_bar
-ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw)
+intel_ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw)
{
KASSERT(mw < ntb->mw_count,
@@ -748,7 +752,7 @@ bar_is_64bit(struct ntb_softc *ntb, enum ntb_bar bar)
{
/* XXX This assertion could be stronger. */
KASSERT(bar < NTB_MAX_BARS, ("bogus bar"));
- return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(NTB_SPLIT_BAR));
+ return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(ntb, NTB_SPLIT_BAR));
}
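
HAS_FEATURE() now takes the softc explicitly instead of relying on an `ntb` variable being in scope. Its definition is outside this hunk; presumably it reduces to a simple mask test along these lines (a sketch, not the verbatim macro):

    #define HAS_FEATURE(ntb, feature) \
            (((ntb)->features & (feature)) != 0)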
static inline void
@@ -789,7 +793,7 @@ bar_get_xlat_params(struct ntb_softc *ntb, enum ntb_bar bar, uint32_t *base,
}
static int
-ntb_map_pci_bars(struct ntb_softc *ntb)
+intel_ntb_map_pci_bars(struct ntb_softc *ntb)
{
int rc;
@@ -814,7 +818,7 @@ ntb_map_pci_bars(struct ntb_softc *ntb)
ntb->bar_info[NTB_B2B_BAR_2].ssz_off = XEON_SBAR4SZ_OFFSET;
ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off = XEON_PBAR4XLAT_OFFSET;
- if (!HAS_FEATURE(NTB_SPLIT_BAR))
+ if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR))
goto out;
ntb->bar_info[NTB_B2B_BAR_3].pci_resource_id = PCIR_BAR(5);
@@ -888,7 +892,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
* but the PCI driver does not honor the size in this call, so we have
* to modify it after the fact.
*/
- if (HAS_FEATURE(NTB_BAR_SIZE_4K)) {
+ if (HAS_FEATURE(ntb, NTB_BAR_SIZE_4K)) {
if (bar->pci_resource_id == PCIR_BAR(2))
bar_size_bits = pci_read_config(ntb->device,
XEON_PBAR23SZ_OFFSET, 1);
@@ -915,7 +919,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
* Optionally, mark MW BARs as anything other than UC to improve
* performance.
*/
- mapmode = ntb_pat_flags();
+ mapmode = intel_ntb_pat_flags();
if (mapmode == bar->map_mode)
return (0);
@@ -928,7 +932,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
(char *)bar->vbase + bar->size - 1,
(void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
- ntb_vm_memattr_to_str(mapmode));
+ intel_ntb_vm_memattr_to_str(mapmode));
} else
device_printf(ntb->device,
"Unable to mark BAR%d v:[%p-%p] p:[%p-%p] as "
@@ -936,13 +940,13 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
(char *)bar->vbase + bar->size - 1,
(void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
- ntb_vm_memattr_to_str(mapmode), rc);
+ intel_ntb_vm_memattr_to_str(mapmode), rc);
/* Proceed anyway */
return (0);
}
static void
-ntb_unmap_pci_bar(struct ntb_softc *ntb)
+intel_ntb_unmap_pci_bar(struct ntb_softc *ntb)
{
struct ntb_pci_bar_info *current_bar;
int i;
@@ -957,7 +961,7 @@ ntb_unmap_pci_bar(struct ntb_softc *ntb)
}
static int
-ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors)
+intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors)
{
uint32_t i;
int rc;
@@ -1012,7 +1016,7 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, prefer_intx_to_remap, CTLFLAG_RDTUN,
* round-robin fashion.
*/
static int
-ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
+intel_ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
{
u_int *vectors;
uint32_t i;
@@ -1032,7 +1036,7 @@ ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
}
static int
-ntb_init_isr(struct ntb_softc *ntb)
+intel_ntb_init_isr(struct ntb_softc *ntb)
{
uint32_t desired_vectors, num_vectors;
int rc;
@@ -1058,7 +1062,7 @@ ntb_init_isr(struct ntb_softc *ntb)
num_vectors--;
if (rc == 0 && num_vectors < desired_vectors) {
- rc = ntb_remap_msix(ntb->device, desired_vectors,
+ rc = intel_ntb_remap_msix(ntb->device, desired_vectors,
num_vectors);
if (rc == 0)
num_vectors = desired_vectors;
@@ -1071,7 +1075,7 @@ ntb_init_isr(struct ntb_softc *ntb)
num_vectors = 1;
if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) {
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
device_printf(ntb->device,
"Errata workaround does not support MSI or INTX\n");
return (EINVAL);
@@ -1079,32 +1083,30 @@ ntb_init_isr(struct ntb_softc *ntb)
ntb->db_vec_count = 1;
ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT;
- rc = ntb_setup_legacy_interrupt(ntb);
+ rc = intel_ntb_setup_legacy_interrupt(ntb);
} else {
if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS &&
- HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
device_printf(ntb->device,
"Errata workaround expects %d doorbell bits\n",
XEON_NONLINK_DB_MSIX_BITS);
return (EINVAL);
}
- ntb_create_msix_vec(ntb, num_vectors);
- rc = ntb_setup_msix(ntb, num_vectors);
- if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- ntb_get_msix_info(ntb);
+ intel_ntb_create_msix_vec(ntb, num_vectors);
+ rc = intel_ntb_setup_msix(ntb, num_vectors);
}
if (rc != 0) {
device_printf(ntb->device,
"Error allocating interrupts: %d\n", rc);
- ntb_free_msix_vec(ntb);
+ intel_ntb_free_msix_vec(ntb);
}
return (rc);
}
static int
-ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
+intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
{
int rc;
@@ -1131,7 +1133,7 @@ ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
}
static void
-ntb_teardown_interrupts(struct ntb_softc *ntb)
+intel_ntb_teardown_interrupts(struct ntb_softc *ntb)
{
struct ntb_int_info *current_int;
int i;
@@ -1147,7 +1149,7 @@ ntb_teardown_interrupts(struct ntb_softc *ntb)
rman_get_rid(current_int->res), current_int->res);
}
- ntb_free_msix_vec(ntb);
+ intel_ntb_free_msix_vec(ntb);
pci_release_msi(ntb->device);
}
@@ -1160,11 +1162,11 @@ db_ioread(struct ntb_softc *ntb, uint64_t regoff)
{
if (ntb->type == NTB_ATOM)
- return (ntb_reg_read(8, regoff));
+ return (intel_ntb_reg_read(8, regoff));
KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
- return (ntb_reg_read(2, regoff));
+ return (intel_ntb_reg_read(2, regoff));
}
static inline void
@@ -1186,89 +1188,78 @@ db_iowrite_raw(struct ntb_softc *ntb, uint64_t regoff, uint64_t val)
{
if (ntb->type == NTB_ATOM) {
- ntb_reg_write(8, regoff, val);
+ intel_ntb_reg_write(8, regoff, val);
return;
}
KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
- ntb_reg_write(2, regoff, (uint16_t)val);
+ intel_ntb_reg_write(2, regoff, (uint16_t)val);
}
-void
-ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_set_mask(device_t dev, uint64_t bits)
{
-
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return;
+ struct ntb_softc *ntb = device_get_softc(dev);
DB_MASK_LOCK(ntb);
ntb->db_mask |= bits;
- db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ if (!HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
DB_MASK_UNLOCK(ntb);
}
-void
-ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_clear_mask(device_t dev, uint64_t bits)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
+ uint64_t ibits;
+ int i;
KASSERT((bits & ~ntb->db_valid_mask) == 0,
("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return;
-
DB_MASK_LOCK(ntb);
+ ibits = ntb->fake_db_bell & ntb->db_mask & bits;
ntb->db_mask &= ~bits;
- db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ /* Simulate fake interrupts if unmasked DB bits are set. */
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ if ((ibits & intel_ntb_db_vector_mask(dev, i)) != 0)
+ swi_sched(ntb->int_info[i].tag, 0);
+ }
+ } else {
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ }
DB_MASK_UNLOCK(ntb);
}
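
Under NTB_SB01BASE_LOCKUP there is no usable doorbell hardware, so masking is emulated: set_mask only records bits in db_mask, and clear_mask replays any doorbells that arrived while masked by scheduling the matching software interrupts. A hedged walk-through with hypothetical state:

    /*
     * Suppose DB bit 1 fired while masked:
     *   fake_db_bell == 0x2, db_mask == 0x2.
     * intel_ntb_db_clear_mask(dev, 0x2) computes
     *   ibits = fake_db_bell & db_mask & bits == 0x2,
     * and since bit 1 falls in vector 1's mask,
     *   swi_sched(ntb->int_info[1].tag, 0)
     * delivers the deferred doorbell event.
     */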
-uint64_t
-ntb_db_read(struct ntb_softc *ntb)
+static uint64_t
+intel_ntb_db_read(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- uint64_t res;
- unsigned i;
-
- res = 0;
- for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if (ntb->msix_vec[i].masked != 0)
- res |= ntb_db_vector_mask(ntb, i);
- }
- return (res);
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
+ return (ntb->fake_db_bell);
return (db_ioread(ntb, ntb->self_reg->db_bell));
}
-void
-ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_clear(device_t dev, uint64_t bits)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
KASSERT((bits & ~ntb->db_valid_mask) == 0,
("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- unsigned i;
-
- for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if ((bits & ntb_db_vector_mask(ntb, i)) != 0) {
- DB_MASK_LOCK(ntb);
- if (ntb->msix_vec[i].masked != 0) {
- /* XXX These need a public API. */
-#if 0
- pci_unmask_msix(ntb->device, i);
-#endif
- ntb->msix_vec[i].masked = 0;
- }
- DB_MASK_UNLOCK(ntb);
- }
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ DB_MASK_LOCK(ntb);
+ ntb->fake_db_bell &= ~bits;
+ DB_MASK_UNLOCK(ntb);
return;
}
@@ -1276,43 +1267,59 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
}
static inline uint64_t
-ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
+intel_ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
{
uint64_t shift, mask;
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ /*
+	 * Remap vectors in a custom way so that at least the
+	 * first three doorbells do not generate stray events.
+	 * This breaks Linux compatibility (if one existed)
+	 * when more than one DB is used (not the case for if_ntb).
+ */
+ if (db_vector < XEON_NONLINK_DB_MSIX_BITS - 1)
+ return (1 << db_vector);
+ if (db_vector == XEON_NONLINK_DB_MSIX_BITS - 1)
+ return (0x7ffc);
+ }
+
shift = ntb->db_vec_shift;
mask = (1ull << shift) - 1;
return (mask << (shift * db_vector));
}
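
Assuming XEON_NONLINK_DB_MSIX_BITS is 3 (consistent with the msix_data sizing asserted elsewhere in this file), the remap above works out to:

    /*
     * SB01BASE_LOCKUP vector-to-doorbell mapping (sketch):
     *   vector 0 -> 0x0001   (doorbell 0)
     *   vector 1 -> 0x0002   (doorbell 1)
     *   vector 2 -> 0x7ffc   (doorbells 2..14, shared)
     * Without the feature, each vector owns a contiguous slice:
     *   ((1ull << db_vec_shift) - 1) << (db_vec_shift * vector).
     */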
static void
-ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
+intel_ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
{
uint64_t vec_mask;
ntb->last_ts = ticks;
- vec_mask = ntb_vec_mask(ntb, vec);
+ vec_mask = intel_ntb_vec_mask(ntb, vec);
if ((vec_mask & ntb->db_link_mask) != 0) {
- if (ntb_poll_link(ntb))
- ntb_link_event(ntb);
+ if (intel_ntb_poll_link(ntb))
+ ntb_link_event(ntb->device);
}
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
(vec_mask & ntb->db_link_mask) == 0) {
DB_MASK_LOCK(ntb);
- if (ntb->msix_vec[vec].masked == 0) {
- /* XXX These need a public API. */
-#if 0
- pci_mask_msix(ntb->device, vec);
-#endif
- ntb->msix_vec[vec].masked = 1;
- }
+
+	/* Do not report the same DB events again until they are cleared. */
+ vec_mask &= ~ntb->fake_db_bell;
+
+ /* Update our internal doorbell register. */
+ ntb->fake_db_bell |= vec_mask;
+
+ /* Do not report masked DB events. */
+ vec_mask &= ~ntb->db_mask;
+
DB_MASK_UNLOCK(ntb);
}
if ((vec_mask & ntb->db_valid_mask) != 0)
- ntb_db_event(ntb, vec);
+ ntb_db_event(ntb->device, vec);
}
static void
@@ -1320,18 +1327,18 @@ ndev_vec_isr(void *arg)
{
struct ntb_vec *nvec = arg;
- ntb_interrupt(nvec->ntb, nvec->num);
+ intel_ntb_interrupt(nvec->ntb, nvec->num);
}
static void
ndev_irq_isr(void *arg)
{
/* If we couldn't set up MSI-X, we only have the one vector. */
- ntb_interrupt(arg, 0);
+ intel_ntb_interrupt(arg, 0);
}
static int
-ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
+intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
{
uint32_t i;
@@ -1346,7 +1353,7 @@ ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
}
static void
-ntb_free_msix_vec(struct ntb_softc *ntb)
+intel_ntb_free_msix_vec(struct ntb_softc *ntb)
{
if (ntb->msix_vec == NULL)
@@ -1357,7 +1364,7 @@ ntb_free_msix_vec(struct ntb_softc *ntb)
}
static void
-ntb_get_msix_info(struct ntb_softc *ntb)
+intel_ntb_get_msix_info(struct ntb_softc *ntb)
{
struct pci_devinfo *dinfo;
struct pcicfg_msix *msix;
@@ -1366,8 +1373,6 @@ ntb_get_msix_info(struct ntb_softc *ntb)
dinfo = device_get_ivars(ntb->device);
msix = &dinfo->cfg.msix;
- laddr = data = 0;
-
CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data));
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
@@ -1375,7 +1380,7 @@ ntb_get_msix_info(struct ntb_softc *ntb)
laddr = bus_read_4(msix->msix_table_res, offset +
PCI_MSIX_ENTRY_LOWER_ADDR);
- ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr);
+ intel_ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr);
KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE,
("local MSIX addr 0x%x not in MSI base 0x%x", laddr,
@@ -1384,14 +1389,14 @@ ntb_get_msix_info(struct ntb_softc *ntb)
data = bus_read_4(msix->msix_table_res, offset +
PCI_MSIX_ENTRY_DATA);
- ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data);
+ intel_ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data);
ntb->msix_data[i].nmd_data = data;
}
}
static struct ntb_hw_info *
-ntb_get_device_info(uint32_t device_id)
+intel_ntb_get_device_info(uint32_t device_id)
{
struct ntb_hw_info *ep = pci_ids;
@@ -1404,15 +1409,15 @@ ntb_get_device_info(uint32_t device_id)
}
static void
-ntb_teardown_xeon(struct ntb_softc *ntb)
+intel_ntb_teardown_xeon(struct ntb_softc *ntb)
{
if (ntb->reg != NULL)
- ntb_link_disable(ntb);
+ intel_ntb_link_disable(ntb->device);
}
static void
-ntb_detect_max_mw(struct ntb_softc *ntb)
+intel_ntb_detect_max_mw(struct ntb_softc *ntb)
{
if (ntb->type == NTB_ATOM) {
@@ -1420,14 +1425,14 @@ ntb_detect_max_mw(struct ntb_softc *ntb)
return;
}
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
ntb->mw_count = XEON_HSX_SPLIT_MW_COUNT;
else
ntb->mw_count = XEON_SNB_MW_COUNT;
}
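
Splitting BAR4/5 turns one 64-bit memory window into two 32-bit ones, which is why the split case reports one more window. A hedged note assuming the usual constants:

    /*
     * Assuming XEON_SNB_MW_COUNT == 2  (BAR2/3 + BAR4/5) and
     * XEON_HSX_SPLIT_MW_COUNT == 3     (BAR2/3 + BAR4 + BAR5):
     * NTB_SPLIT_BAR trades one 64-bit window for two 32-bit windows.
     */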
static int
-ntb_detect_xeon(struct ntb_softc *ntb)
+intel_ntb_detect_xeon(struct ntb_softc *ntb)
{
uint8_t ppd, conn_type;
@@ -1442,11 +1447,21 @@ ntb_detect_xeon(struct ntb_softc *ntb)
if ((ppd & XEON_PPD_SPLIT_BAR) != 0)
ntb->features |= NTB_SPLIT_BAR;
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
+ !HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
+ device_printf(ntb->device,
+ "Can not apply SB01BASE_LOCKUP workaround "
+ "with split BARs disabled!\n");
+ device_printf(ntb->device,
+ "Expect system hangs under heavy NTB traffic!\n");
+ ntb->features &= ~NTB_SB01BASE_LOCKUP;
+ }
+
/*
* SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP
* errata workaround; only do one at a time.
*/
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
ntb->features &= ~NTB_SDOORBELL_LOCKUP;
conn_type = ppd & XEON_PPD_CONN_TYPE;
@@ -1465,7 +1480,7 @@ ntb_detect_xeon(struct ntb_softc *ntb)
}
static int
-ntb_detect_atom(struct ntb_softc *ntb)
+intel_ntb_detect_atom(struct ntb_softc *ntb)
{
uint32_t ppd, conn_type;
@@ -1490,7 +1505,7 @@ ntb_detect_atom(struct ntb_softc *ntb)
}
static int
-ntb_xeon_init_dev(struct ntb_softc *ntb)
+intel_ntb_xeon_init_dev(struct ntb_softc *ntb)
{
int rc;
@@ -1511,15 +1526,16 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
ntb->peer_reg = &xeon_b2b_reg;
ntb->xlat_reg = &xeon_sec_xlat;
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ ntb->fake_db_bell = 0;
ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) %
ntb->mw_count;
- ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
+ intel_ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
g_ntb_msix_idx, ntb->msix_mw_idx);
- rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
+ rc = intel_ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
VM_MEMATTR_UNCACHEABLE);
KASSERT(rc == 0, ("shouldn't fail"));
- } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ } else if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
/*
* There is a Xeon hardware errata related to writes to SDOORBELL or
* B2BDOORBELL in conjunction with inbound access to NTB MMIO space,
@@ -1529,12 +1545,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
*/
ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) %
ntb->mw_count;
- ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
+ intel_ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
g_ntb_mw_idx, ntb->b2b_mw_idx);
- rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
+ rc = intel_ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
VM_MEMATTR_UNCACHEABLE);
KASSERT(rc == 0, ("shouldn't fail"));
- } else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14))
+ } else if (HAS_FEATURE(ntb, NTB_B2BDOORBELL_BIT14))
/*
* HW Errata on bit 14 of b2bdoorbell register. Writes will not be
* mirrored to the remote system. Shrink the number of bits by one,
@@ -1557,7 +1573,7 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
return (rc);
/* Enable Bus Master and Memory Space on the secondary side */
- ntb_reg_write(2, XEON_SPCICMD_OFFSET,
+ intel_ntb_reg_write(2, XEON_SPCICMD_OFFSET,
PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
/*
@@ -1568,16 +1584,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
DB_MASK_UNLOCK(ntb);
- rc = xeon_setup_msix_bar(ntb);
- if (rc != 0)
- return (rc);
-
- rc = ntb_init_isr(ntb);
+ rc = intel_ntb_init_isr(ntb);
return (rc);
}
static int
-ntb_atom_init_dev(struct ntb_softc *ntb)
+intel_ntb_atom_init_dev(struct ntb_softc *ntb)
{
int error;
@@ -1604,15 +1616,15 @@ ntb_atom_init_dev(struct ntb_softc *ntb)
configure_atom_secondary_side_bars(ntb);
/* Enable Bus Master and Memory Space on the secondary side */
- ntb_reg_write(2, ATOM_SPCICMD_OFFSET,
+ intel_ntb_reg_write(2, ATOM_SPCICMD_OFFSET,
PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
- error = ntb_init_isr(ntb);
+ error = intel_ntb_init_isr(ntb);
if (error != 0)
return (error);
/* Initiate PCI-E link training */
- ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
callout_reset(&ntb->heartbeat_timer, 0, atom_link_hb, ntb);
@@ -1625,19 +1637,19 @@ configure_atom_secondary_side_bars(struct ntb_softc *ntb)
{
if (ntb->dev_type == NTB_DEV_USD) {
- ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
XEON_B2B_BAR4_ADDR64);
- ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
} else {
- ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
XEON_B2B_BAR4_ADDR64);
- ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
}
}
@@ -1664,7 +1676,7 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx,
struct ntb_pci_bar_info *bar;
uint8_t bar_sz;
- if (!HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3)
+ if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3)
return;
bar = &ntb->bar_info[idx];
@@ -1688,28 +1700,28 @@ xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr,
uint32_t base_reg, lmt_reg;
bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg);
- if (idx == regbar)
- bar_addr += ntb->b2b_off;
+ if (idx == regbar) {
+ if (ntb->b2b_off)
+ bar_addr += ntb->b2b_off;
+ else
+ bar_addr = 0;
+ }
- /*
- * Set limit registers first to avoid an errata where setting the base
- * registers locks the limit registers.
- */
if (!bar_is_64bit(ntb, idx)) {
- ntb_reg_write(4, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(4, lmt_reg);
+ intel_ntb_reg_write(4, base_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(4, base_reg);
(void)reg_val;
- ntb_reg_write(4, base_reg, bar_addr);
- reg_val = ntb_reg_read(4, base_reg);
+ intel_ntb_reg_write(4, lmt_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(4, lmt_reg);
(void)reg_val;
} else {
- ntb_reg_write(8, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(8, lmt_reg);
+ intel_ntb_reg_write(8, base_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(8, base_reg);
(void)reg_val;
- ntb_reg_write(8, base_reg, bar_addr);
- reg_val = ntb_reg_read(8, base_reg);
+ intel_ntb_reg_write(8, lmt_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(8, lmt_reg);
(void)reg_val;
}
}
@@ -1720,30 +1732,17 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx)
struct ntb_pci_bar_info *bar;
bar = &ntb->bar_info[idx];
- if (HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) {
- ntb_reg_write(4, bar->pbarxlat_off, base_addr);
- base_addr = ntb_reg_read(4, bar->pbarxlat_off);
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) {
+ intel_ntb_reg_write(4, bar->pbarxlat_off, base_addr);
+ base_addr = intel_ntb_reg_read(4, bar->pbarxlat_off);
} else {
- ntb_reg_write(8, bar->pbarxlat_off, base_addr);
- base_addr = ntb_reg_read(8, bar->pbarxlat_off);
+ intel_ntb_reg_write(8, bar->pbarxlat_off, base_addr);
+ base_addr = intel_ntb_reg_read(8, bar->pbarxlat_off);
}
(void)base_addr;
}
static int
-xeon_setup_msix_bar(struct ntb_softc *ntb)
-{
- enum ntb_bar bar_num;
-
- if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return (0);
-
- bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
- ntb->peer_lapic_bar = &ntb->bar_info[bar_num];
- return (0);
-}
-
-static int
xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
const struct ntb_b2b_addr *peer_addr)
{
@@ -1757,7 +1756,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
b2b_bar_num = NTB_CONFIG_BAR;
ntb->b2b_off = 0;
} else {
- b2b_bar_num = ntb_mw_to_bar(ntb, ntb->b2b_mw_idx);
+ b2b_bar_num = intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx);
KASSERT(b2b_bar_num > 0 && b2b_bar_num < NTB_MAX_BARS,
("invalid b2b mw bar"));
@@ -1788,7 +1787,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
bar_addr = addr->bar0_addr;
else if (b2b_bar_num == NTB_B2B_BAR_1)
bar_addr = addr->bar2_addr64;
- else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR))
+ else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
bar_addr = addr->bar4_addr64;
else if (b2b_bar_num == NTB_B2B_BAR_2)
bar_addr = addr->bar4_addr32;
@@ -1797,7 +1796,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
else
KASSERT(false, ("invalid bar"));
- ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr);
+ intel_ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr);
/*
* Other SBARs are normally hit by the PBAR xlat, except for the b2b
@@ -1808,7 +1807,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
*/
xeon_set_sbar_base_and_limit(ntb, addr->bar2_addr64, NTB_B2B_BAR_1,
b2b_bar_num);
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
xeon_set_sbar_base_and_limit(ntb, addr->bar4_addr32,
NTB_B2B_BAR_2, b2b_bar_num);
xeon_set_sbar_base_and_limit(ntb, addr->bar5_addr32,
@@ -1818,56 +1817,41 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
NTB_B2B_BAR_2, b2b_bar_num);
/* Zero incoming translation addrs */
- ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
- ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
-
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- size_t size, xlatoffset;
+ intel_ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
- switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) {
- case NTB_B2B_BAR_1:
- size = 8;
- xlatoffset = XEON_SBAR2XLAT_OFFSET;
- break;
- case NTB_B2B_BAR_2:
- xlatoffset = XEON_SBAR4XLAT_OFFSET;
- if (HAS_FEATURE(NTB_SPLIT_BAR))
- size = 4;
- else
- size = 8;
- break;
- case NTB_B2B_BAR_3:
- xlatoffset = XEON_SBAR5XLAT_OFFSET;
- size = 4;
- break;
- default:
- KASSERT(false, ("Bogus msix mw idx: %u",
- ntb->msix_mw_idx));
- return (EINVAL);
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ uint32_t xlat_reg, lmt_reg;
+ enum ntb_bar bar_num;
/*
* We point the chosen MSIX MW BAR xlat to remote LAPIC for
* workaround
*/
- if (size == 4) {
- ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE);
- ntb->msix_xlat = ntb_reg_read(4, xlatoffset);
+ bar_num = intel_ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
+ bar_get_xlat_params(ntb, bar_num, NULL, &xlat_reg, &lmt_reg);
+ if (bar_is_64bit(ntb, bar_num)) {
+ intel_ntb_reg_write(8, xlat_reg, MSI_INTEL_ADDR_BASE);
+ ntb->msix_xlat = intel_ntb_reg_read(8, xlat_reg);
+ intel_ntb_reg_write(8, lmt_reg, 0);
} else {
- ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE);
- ntb->msix_xlat = ntb_reg_read(8, xlatoffset);
+ intel_ntb_reg_write(4, xlat_reg, MSI_INTEL_ADDR_BASE);
+ ntb->msix_xlat = intel_ntb_reg_read(4, xlat_reg);
+ intel_ntb_reg_write(4, lmt_reg, 0);
}
+
+ ntb->peer_lapic_bar = &ntb->bar_info[bar_num];
}
- (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
- (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
+ (void)intel_ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
+ (void)intel_ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
/* Zero outgoing translation limits (whole bar size windows) */
- ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
- ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
/* Set outgoing translation offsets */
xeon_set_pbar_xlat(ntb, peer_addr->bar2_addr64, NTB_B2B_BAR_1);
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
xeon_set_pbar_xlat(ntb, peer_addr->bar4_addr32, NTB_B2B_BAR_2);
xeon_set_pbar_xlat(ntb, peer_addr->bar5_addr32, NTB_B2B_BAR_3);
} else
@@ -1879,7 +1863,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
bar_addr = peer_addr->bar0_addr;
else if (b2b_bar_num == NTB_B2B_BAR_1)
bar_addr = peer_addr->bar2_addr64;
- else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR))
+ else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
bar_addr = peer_addr->bar4_addr64;
else if (b2b_bar_num == NTB_B2B_BAR_2)
bar_addr = peer_addr->bar4_addr32;
@@ -1892,8 +1876,8 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
* B2B_XLAT_OFFSET is a 64-bit register but can only be written 32 bits
* at a time.
*/
- ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff);
- ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32);
+ intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff);
+ intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32);
return (0);
}
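
The closing pair of writes splits the 64-bit peer offset into 32-bit halves, as the comment above notes; for example:

    uint64_t bar_addr = 0x8000000000ULL;    /* hypothetical peer address */
    uint32_t lo = bar_addr & 0xffffffff;    /* 0x0  -> XEON_B2B_XLAT_OFFSETL */
    uint32_t hi = bar_addr >> 32;           /* 0x80 -> XEON_B2B_XLAT_OFFSETU */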
@@ -1912,7 +1896,7 @@ link_is_up(struct ntb_softc *ntb)
if (ntb->type == NTB_XEON)
return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good ||
- !HAS_FEATURE(NTB_SB01BASE_LOCKUP)));
+ !HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)));
KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) == 0);
@@ -1925,11 +1909,11 @@ atom_link_is_err(struct ntb_softc *ntb)
KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
- status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
if ((status & ATOM_LTSSMSTATEJMP_FORCEDETECT) != 0)
return (true);
- status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
return ((status & ATOM_IBIST_ERR_OFLOW) != 0);
}
@@ -1952,8 +1936,8 @@ atom_link_hb(void *arg)
goto out;
}
- if (ntb_poll_link(ntb))
- ntb_link_event(ntb);
+ if (intel_ntb_poll_link(ntb))
+ ntb_link_event(ntb->device);
if (!link_is_up(ntb) && atom_link_is_err(ntb)) {
/* Link is down with error, proceed with recovery */
@@ -1971,166 +1955,47 @@ atom_perform_link_restart(struct ntb_softc *ntb)
uint32_t status;
/* Driver resets the NTB ModPhy lanes - magic! */
- ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60);
/* Driver waits 100ms to allow the NTB ModPhy to settle */
pause("ModPhy", hz / 10);
/* Clear AER Errors, write to clear */
- status = ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET);
status &= PCIM_AER_COR_REPLAY_ROLLOVER;
- ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status);
/* Clear unexpected electrical idle event in LTSSM, write to clear */
- status = ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET);
status |= ATOM_LTSSMERRSTS0_UNEXPECTEDEI;
- ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status);
/* Clear DeSkew Buffer error, write to clear */
- status = ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET);
status |= ATOM_DESKEWSTS_DBERR;
- ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status);
- status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
status &= ATOM_IBIST_ERR_OFLOW;
- ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status);
/* Releases the NTB state machine to allow the link to retrain */
- status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
status &= ~ATOM_LTSSMSTATEJMP_FORCEDETECT;
- ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status);
-}
-
-/*
- * ntb_set_ctx() - associate a driver context with an ntb device
- * @ntb: NTB device context
- * @ctx: Driver context
- * @ctx_ops: Driver context operations
- *
- * Associate a driver context and operations with a ntb device. The context is
- * provided by the client driver, and the driver may associate a different
- * context with each ntb device.
- *
- * Return: Zero if the context is associated, otherwise an error number.
- */
-int
-ntb_set_ctx(struct ntb_softc *ntb, void *ctx, const struct ntb_ctx_ops *ops)
-{
-
- if (ctx == NULL || ops == NULL)
- return (EINVAL);
- if (ntb->ctx_ops != NULL)
- return (EINVAL);
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL) {
- CTX_UNLOCK(ntb);
- return (EINVAL);
- }
- ntb->ntb_ctx = ctx;
- ntb->ctx_ops = ops;
- CTX_UNLOCK(ntb);
-
- return (0);
-}
-
-/*
- * It is expected that this will only be used from contexts where the ctx_lock
- * is not needed to protect ntb_ctx lifetime.
- */
-void *
-ntb_get_ctx(struct ntb_softc *ntb, const struct ntb_ctx_ops **ops)
-{
-
- KASSERT(ntb->ntb_ctx != NULL && ntb->ctx_ops != NULL, ("bogus"));
- if (ops != NULL)
- *ops = ntb->ctx_ops;
- return (ntb->ntb_ctx);
-}
-
-/*
- * ntb_clear_ctx() - disassociate any driver context from an ntb device
- * @ntb: NTB device context
- *
- * Clear any association that may exist between a driver context and the ntb
- * device.
- */
-void
-ntb_clear_ctx(struct ntb_softc *ntb)
-{
-
- CTX_LOCK(ntb);
- ntb->ntb_ctx = NULL;
- ntb->ctx_ops = NULL;
- CTX_UNLOCK(ntb);
-}
-
-/*
- * ntb_link_event() - notify driver context of a change in link status
- * @ntb: NTB device context
- *
- * Notify the driver context that the link status may have changed. The driver
- * should call ntb_link_is_up() to get the current status.
- */
-void
-ntb_link_event(struct ntb_softc *ntb)
-{
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL && ntb->ctx_ops->link_event != NULL)
- ntb->ctx_ops->link_event(ntb->ntb_ctx);
- CTX_UNLOCK(ntb);
+ intel_ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status);
}
-/*
- * ntb_db_event() - notify driver context of a doorbell event
- * @ntb: NTB device context
- * @vector: Interrupt vector number
- *
- * Notify the driver context of a doorbell event. If hardware supports
- * multiple interrupt vectors for doorbells, the vector number indicates which
- * vector received the interrupt. The vector number is relative to the first
- * vector used for doorbells, starting at zero, and must be less than
- * ntb_db_vector_count(). The driver may call ntb_db_read() to check which
- * doorbell bits need service, and ntb_db_vector_mask() to determine which of
- * those bits are associated with the vector number.
- */
-static void
-ntb_db_event(struct ntb_softc *ntb, uint32_t vec)
-{
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL && ntb->ctx_ops->db_event != NULL)
- ntb->ctx_ops->db_event(ntb->ntb_ctx, vec);
- CTX_UNLOCK(ntb);
-}
-
-/*
- * ntb_link_enable() - enable the link on the secondary side of the ntb
- * @ntb: NTB device context
- * @max_speed: The maximum link speed expressed as PCIe generation number[0]
- * @max_width: The maximum link width expressed as the number of PCIe lanes[0]
- *
- * Enable the link on the secondary side of the ntb. This can only be done
- * from the primary side of the ntb in primary or b2b topology. The ntb device
- * should train the link to its maximum speed and width, or the requested speed
- * and width, whichever is smaller, if supported.
- *
- * Return: Zero on success, otherwise an error number.
- *
- * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed
- * and width input will be ignored.
- */
-int
-ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused,
- enum ntb_width w __unused)
+static int
+intel_ntb_link_enable(device_t dev, enum ntb_speed speed __unused,
+ enum ntb_width width __unused)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
- ntb_printf(2, "%s\n", __func__);
+ intel_ntb_printf(2, "%s\n", __func__);
if (ntb->type == NTB_ATOM) {
pci_write_config(ntb->device, NTB_PPD_OFFSET,
@@ -2139,57 +2004,47 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused,
}
if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
- ntb_link_event(ntb);
+ ntb_link_event(dev);
return (0);
}
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
cntl |= NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP;
- ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
+ intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
return (0);
}
-/*
- * ntb_link_disable() - disable the link on the secondary side of the ntb
- * @ntb: NTB device context
- *
- * Disable the link on the secondary side of the ntb. This can only be done
- * from the primary side of the ntb in primary or b2b topology. The ntb device
- * should disable the link. Returning from this call must indicate that a
- * barrier has passed, though with no more writes may pass in either direction
- * across the link, except if this call returns an error number.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_link_disable(struct ntb_softc *ntb)
+static int
+intel_ntb_link_disable(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
- ntb_printf(2, "%s\n", __func__);
+ intel_ntb_printf(2, "%s\n", __func__);
if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
- ntb_link_event(ntb);
+ ntb_link_event(dev);
return (0);
}
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP);
cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
- ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
+ intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
return (0);
}
-bool
-ntb_link_enabled(struct ntb_softc *ntb)
+static bool
+intel_ntb_link_enabled(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
if (ntb->type == NTB_ATOM) {
@@ -2200,7 +2055,7 @@ ntb_link_enabled(struct ntb_softc *ntb)
if (ntb->conn_type == NTB_CONN_TRANSPARENT)
return (true);
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
return ((cntl & NTB_CNTL_LINK_DISABLE) == 0);
}
@@ -2225,11 +2080,11 @@ recover_atom_link(void *arg)
if (atom_link_is_err(ntb))
goto retry;
- status32 = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ status32 = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
if ((status32 & ATOM_CNTL_LINK_DOWN) != 0)
goto out;
- status32 = ntb_reg_read(4, ntb->reg->lnk_sta);
+ status32 = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
width = NTB_LNK_STA_WIDTH(status32);
speed = status32 & NTB_LINK_SPEED_MASK;
@@ -2252,18 +2107,18 @@ retry:
* Polls the HW link status register(s); returns true if something has changed.
*/
static bool
-ntb_poll_link(struct ntb_softc *ntb)
+intel_ntb_poll_link(struct ntb_softc *ntb)
{
uint32_t ntb_cntl;
uint16_t reg_val;
if (ntb->type == NTB_ATOM) {
- ntb_cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ ntb_cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
if (ntb_cntl == ntb->ntb_ctl)
return (false);
ntb->ntb_ctl = ntb_cntl;
- ntb->lnk_sta = ntb_reg_read(4, ntb->reg->lnk_sta);
+ ntb->lnk_sta = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
} else {
db_iowrite_raw(ntb, ntb->self_reg->db_bell, ntb->db_link_mask);
@@ -2273,11 +2128,11 @@ ntb_poll_link(struct ntb_softc *ntb)
ntb->lnk_sta = reg_val;
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
if (_xeon_link_is_up(ntb)) {
if (!ntb->peer_msix_good) {
callout_reset(&ntb->peer_msix_work, 0,
- ntb_exchange_msix, ntb);
+ intel_ntb_exchange_msix, ntb);
return (false);
}
} else {
@@ -2290,7 +2145,7 @@ ntb_poll_link(struct ntb_softc *ntb)
}
static inline enum ntb_speed
-ntb_link_sta_speed(struct ntb_softc *ntb)
+intel_ntb_link_sta_speed(struct ntb_softc *ntb)
{
if (!link_is_up(ntb))
@@ -2299,7 +2154,7 @@ ntb_link_sta_speed(struct ntb_softc *ntb)
}
static inline enum ntb_width
-ntb_link_sta_width(struct ntb_softc *ntb)
+intel_ntb_link_sta_width(struct ntb_softc *ntb)
{
if (!link_is_up(ntb))
@@ -2321,7 +2176,7 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0,
#define NTB_REGFLAGS_MASK (NTB_REGSZ_MASK | NTB_DB_READ | NTB_PCI_REG)
static void
-ntb_sysctl_init(struct ntb_softc *ntb)
+intel_ntb_sysctl_init(struct ntb_softc *ntb)
{
struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar;
struct sysctl_ctx_list *ctx;
@@ -2424,7 +2279,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_xlat,
sysctl_handle_register, "QU", "Incoming XLAT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_xlat,
@@ -2444,7 +2299,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_limit,
sysctl_handle_register, "QU", "Incoming LMT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_limit,
@@ -2535,7 +2390,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->bar_info[NTB_B2B_BAR_1].pbarxlat_off,
sysctl_handle_register, "QU", "Outgoing XLAT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off,
@@ -2555,7 +2410,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | XEON_PBAR2LMT_OFFSET,
sysctl_handle_register, "QU", "Outgoing LMT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | XEON_PBAR4LMT_OFFSET,
@@ -2579,7 +2434,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_base,
sysctl_handle_register, "QU", "Secondary BAR23 base register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar4_base",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_base,
@@ -2602,13 +2457,10 @@ ntb_sysctl_init(struct ntb_softc *ntb)
static int
sysctl_handle_features(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
struct sbuf sb;
int error;
- error = 0;
- ntb = arg1;
-
sbuf_new_for_sysctl(&sb, NULL, 256, req);
sbuf_printf(&sb, "%b", ntb->features, NTB_FEATURES_STR);
@@ -2623,14 +2475,11 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS)
static int
sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
unsigned old, new;
int error;
- error = 0;
- ntb = arg1;
-
- old = ntb_link_enabled(ntb);
+ old = intel_ntb_link_enabled(ntb->device);
error = SYSCTL_OUT(req, &old, sizeof(old));
if (error != 0 || req->newptr == NULL)
@@ -2640,31 +2489,28 @@ sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
if (error != 0)
return (error);
- ntb_printf(0, "Admin set interface state to '%sabled'\n",
+ intel_ntb_printf(0, "Admin set interface state to '%sabled'\n",
(new != 0)? "en" : "dis");
if (new != 0)
- error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ error = intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
else
- error = ntb_link_disable(ntb);
+ error = intel_ntb_link_disable(ntb->device);
return (error);
}
static int
sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
struct sbuf sb;
enum ntb_speed speed;
enum ntb_width width;
int error;
- error = 0;
- ntb = arg1;
-
sbuf_new_for_sysctl(&sb, NULL, 32, req);
- if (ntb_link_is_up(ntb, &speed, &width))
+ if (intel_ntb_link_is_up(ntb->device, &speed, &width))
sbuf_printf(&sb, "up / PCIe Gen %u / Width x%u",
(unsigned)speed, (unsigned)width);
else
@@ -2681,14 +2527,11 @@ sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
static int
sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
unsigned res;
int error;
- error = 0;
- ntb = arg1;
-
- res = ntb_link_is_up(ntb, NULL, NULL);
+ res = intel_ntb_link_is_up(ntb->device, NULL, NULL);
error = SYSCTL_OUT(req, &res, sizeof(res));
if (error || !req->newptr)
@@ -2727,28 +2570,28 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS)
if (pci)
umv = pci_read_config(ntb->device, reg, 8);
else
- umv = ntb_reg_read(8, reg);
+ umv = intel_ntb_reg_read(8, reg);
outsz = sizeof(uint64_t);
break;
case NTB_REG_32:
if (pci)
umv = pci_read_config(ntb->device, reg, 4);
else
- umv = ntb_reg_read(4, reg);
+ umv = intel_ntb_reg_read(4, reg);
outsz = sizeof(uint32_t);
break;
case NTB_REG_16:
if (pci)
umv = pci_read_config(ntb->device, reg, 2);
else
- umv = ntb_reg_read(2, reg);
+ umv = intel_ntb_reg_read(2, reg);
outsz = sizeof(uint16_t);
break;
case NTB_REG_8:
if (pci)
umv = pci_read_config(ntb->device, reg, 1);
else
- umv = ntb_reg_read(1, reg);
+ umv = intel_ntb_reg_read(1, reg);
outsz = sizeof(uint8_t);
break;
default:
@@ -2768,7 +2611,7 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS)
}
static unsigned
-ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
+intel_ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
{
if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
@@ -2782,8 +2625,21 @@ ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
return (uidx);
}
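
User-visible window indices skip any window wholly consumed internally (the B2B register window when b2b_off is zero, and the MSIX window under SB01BASE_LOCKUP); part of that logic is elided by this hunk, so the example below is a sketch:

    /*
     * Sketch: mw_count == 3 and hw window 0 consumed for B2B
     * registers (b2b_off == 0):
     *   user idx 0 -> hw mw 1
     *   user idx 1 -> hw mw 2
     * If instead the last window (2) is consumed, user indices
     * 0 and 1 map straight through and intel_ntb_mw_count()
     * simply reports 2.
     */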
+#ifndef EARLY_AP_STARTUP
+static int msix_ready;
+
+static void
+intel_ntb_msix_ready(void *arg __unused)
+{
+
+ msix_ready = 1;
+}
+SYSINIT(intel_ntb_msix_ready, SI_SUB_SMP, SI_ORDER_ANY,
+ intel_ntb_msix_ready, NULL);
+#endif
+
static void
-ntb_exchange_msix(void *ctx)
+intel_ntb_exchange_msix(void *ctx)
{
struct ntb_softc *ntb;
uint32_t val;
@@ -2796,42 +2652,50 @@ ntb_exchange_msix(void *ctx)
if (ntb->peer_msix_done)
goto msix_done;
+#ifndef EARLY_AP_STARTUP
+	/* Block MSIX negotiation until SMP has started and IRQs are reshuffled. */
+ if (!msix_ready)
+ goto reschedule;
+#endif
+
+ intel_ntb_get_msix_info(ntb);
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i,
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DATA0 + i,
ntb->msix_data[i].nmd_data);
- ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i,
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_OFS0 + i,
ntb->msix_data[i].nmd_ofs - ntb->msix_xlat);
}
- ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
- ntb_spad_read(ntb, NTB_MSIX_GUARD, &val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_GUARD, &val);
if (val != NTB_MSIX_VER_GUARD)
goto reschedule;
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val);
- ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_DATA0 + i, &val);
+ intel_ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val);
ntb->peer_msix_data[i].nmd_data = val;
- ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val);
- ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_OFS0 + i, &val);
+ intel_ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val);
ntb->peer_msix_data[i].nmd_ofs = val;
}
ntb->peer_msix_done = true;
msix_done:
- ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
- ntb_spad_read(ntb, NTB_MSIX_DONE, &val);
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_DONE, &val);
if (val != NTB_MSIX_RECEIVED)
goto reschedule;
+ intel_ntb_spad_clear(ntb->device);
ntb->peer_msix_good = true;
/* Give peer time to see our NTB_MSIX_RECEIVED. */
goto reschedule;
msix_good:
- ntb_poll_link(ntb);
- ntb_link_event(ntb);
+ intel_ntb_poll_link(ntb);
+ ntb_link_event(ntb->device);
return;
reschedule:
@@ -2839,40 +2703,27 @@ reschedule:
if (_xeon_link_is_up(ntb)) {
callout_reset(&ntb->peer_msix_work,
hz * (ntb->peer_msix_good ? 2 : 1) / 100,
- ntb_exchange_msix, ntb);
+ intel_ntb_exchange_msix, ntb);
} else
- ntb_spad_clear(ntb);
+ intel_ntb_spad_clear(ntb->device);
}
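
The exchange is a scratchpad handshake: each side writes into the peer's scratchpads and polls its own (which the peer fills in), rescheduling the callout until every step completes. In outline:

    /*
     * Per side, assuming three nonlink MSIX vectors:
     *  1. Publish local MSIX data/offsets into the peer's
     *     NTB_MSIX_DATA0..2 / NTB_MSIX_OFS0..2 spads.
     *  2. Write NTB_MSIX_VER_GUARD to the peer's NTB_MSIX_GUARD;
     *     proceed once our own guard spad shows the same value.
     *  3. Read back the peer's data/offsets.
     *  4. Ack via NTB_MSIX_RECEIVED in NTB_MSIX_DONE; once our
     *     own done spad shows the ack, clear the spads, mark the
     *     peer good, and eventually raise a link event.
     */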
/*
* Public API to the rest of the OS
*/
-/**
- * ntb_get_max_spads() - get the total scratch regs usable
- * @ntb: pointer to ntb_softc instance
- *
- * This function returns the max 32bit scratchpad registers usable by the
- * upper layer.
- *
- * RETURNS: total number of scratch pad registers available
- */
-uint8_t
-ntb_get_max_spads(struct ntb_softc *ntb)
+static uint8_t
+intel_ntb_spad_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
return (ntb->spad_count);
}
-/*
- * ntb_mw_count() - Get the number of memory windows available for KPI
- * consumers.
- *
- * (Excludes any MW wholly reserved for register access.)
- */
-uint8_t
-ntb_mw_count(struct ntb_softc *ntb)
+static uint8_t
+intel_ntb_mw_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint8_t res;
res = ntb->mw_count;
@@ -2883,25 +2734,15 @@ ntb_mw_count(struct ntb_softc *ntb)
return (res);
}
-/**
- * ntb_spad_write() - write to the secondary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
+static int
+intel_ntb_spad_write(device_t dev, unsigned int idx, uint32_t val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val);
+ intel_ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val);
return (0);
}
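+
+/*
+ * Note on the conversion above: with these accessors now static, NTB
+ * consumers reach them through the new ntb_if kobj interface.  A minimal
+ * sketch, assuming the thin wrappers added in sys/dev/ntb/ntb.c, which
+ * dispatch to the intel_ntb_* methods registered below:
+ *
+ *	uint32_t val;
+ *
+ *	ntb_spad_write(dev, 0, 1);
+ *	ntb_spad_read(dev, 0, &val);
+ */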
@@ -2909,122 +2750,77 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
/*
* Zeros the local scratchpad.
*/
-void
-ntb_spad_clear(struct ntb_softc *ntb)
+static void
+intel_ntb_spad_clear(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
unsigned i;
for (i = 0; i < ntb->spad_count; i++)
- ntb_spad_write(ntb, i, 0);
+ intel_ntb_spad_write(dev, i, 0);
}
-/**
- * ntb_spad_read() - read from the primary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val)
+static int
+intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- *val = ntb_reg_read(4, ntb->self_reg->spad + idx * 4);
+ *val = intel_ntb_reg_read(4, ntb->self_reg->spad + idx * 4);
return (0);
}
-/**
- * ntb_peer_spad_write() - write to the secondary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
+static int
+intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP))
- ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP))
+ intel_ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val);
else
- ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val);
+ intel_ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val);
return (0);
}
-/**
- * ntb_peer_spad_read() - read from the primary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val)
+static int
+intel_ntb_peer_spad_read(device_t dev, unsigned int idx, uint32_t *val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP))
- *val = ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP))
+ *val = intel_ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4);
else
- *val = ntb_reg_read(4, ntb->peer_reg->spad + idx * 4);
+ *val = intel_ntb_reg_read(4, ntb->peer_reg->spad + idx * 4);
return (0);
}
-/*
- * ntb_mw_get_range() - get the range of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- * @base: OUT - the base address for mapping the memory window
- * @size: OUT - the size for mapping the memory window
- * @align: OUT - the base alignment for translating the memory window
- * @align_size: OUT - the size alignment for translating the memory window
- *
- * Get the range of a memory window. NULL may be given for any output
- * parameter if the value is not needed. The base and size may be used for
- * mapping the memory window, to access the peer memory. The alignment and
- * size may be used for translating the memory window, for the peer to access
- * memory on the local system.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base,
+static int
+intel_ntb_mw_get_range(device_t dev, unsigned mw_idx, vm_paddr_t *base,
caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
bus_addr_t *plimit)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
bus_addr_t limit;
size_t bar_b2b_off;
enum ntb_bar bar_num;
- if (mw_idx >= ntb_mw_count(ntb))
+ if (mw_idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- mw_idx = ntb_user_mw_to_idx(ntb, mw_idx);
+ mw_idx = intel_ntb_user_mw_to_idx(ntb, mw_idx);
- bar_num = ntb_mw_to_bar(ntb, mw_idx);
+ bar_num = intel_ntb_mw_to_bar(ntb, mw_idx);
bar = &ntb->bar_info[bar_num];
bar_b2b_off = 0;
if (mw_idx == ntb->b2b_mw_idx) {
@@ -3053,37 +2849,21 @@ ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base,
return (0);
}
-/*
- * ntb_mw_set_trans() - set the translation of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- * @addr: The dma address local memory to expose to the peer
- * @size: The size of the local memory to expose to the peer
- *
- * Set the translation of a memory window. The peer may access local memory
- * through the window starting at the address, up to the size. The address
- * must be aligned to the alignment specified by ntb_mw_get_range(). The size
- * must be aligned to the size alignment specified by ntb_mw_get_range(). The
- * address must be below the plimit specified by ntb_mw_get_range() (i.e. for
- * 32-bit BARs).
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
- size_t size)
+static int
+intel_ntb_mw_set_trans(device_t dev, unsigned idx, bus_addr_t addr, size_t size)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
uint64_t base, limit, reg_val;
size_t bar_size, mw_size;
uint32_t base_reg, xlat_reg, limit_reg;
enum ntb_bar bar_num;
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
- bar_num = ntb_mw_to_bar(ntb, idx);
+ bar_num = intel_ntb_mw_to_bar(ntb, idx);
bar = &ntb->bar_info[bar_num];
bar_size = bar->size;
@@ -3103,25 +2883,25 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
limit = 0;
if (bar_is_64bit(ntb, bar_num)) {
- base = ntb_reg_read(8, base_reg) & BAR_HIGH_MASK;
+ base = intel_ntb_reg_read(8, base_reg) & BAR_HIGH_MASK;
if (limit_reg != 0 && size != mw_size)
limit = base + size;
/* Set and verify translation address */
- ntb_reg_write(8, xlat_reg, addr);
- reg_val = ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(8, xlat_reg, addr);
+ reg_val = intel_ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK;
if (reg_val != addr) {
- ntb_reg_write(8, xlat_reg, 0);
+ intel_ntb_reg_write(8, xlat_reg, 0);
return (EIO);
}
/* Set and verify the limit */
- ntb_reg_write(8, limit_reg, limit);
- reg_val = ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(8, limit_reg, limit);
+ reg_val = intel_ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK;
if (reg_val != limit) {
- ntb_reg_write(8, limit_reg, base);
- ntb_reg_write(8, xlat_reg, 0);
+ intel_ntb_reg_write(8, limit_reg, base);
+ intel_ntb_reg_write(8, xlat_reg, 0);
return (EIO);
}
} else {
@@ -3132,98 +2912,72 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
if (((addr + size) & UINT32_MAX) != (addr + size))
return (ERANGE);
- base = ntb_reg_read(4, base_reg) & BAR_HIGH_MASK;
+ base = intel_ntb_reg_read(4, base_reg) & BAR_HIGH_MASK;
if (limit_reg != 0 && size != mw_size)
limit = base + size;
/* Set and verify translation address */
- ntb_reg_write(4, xlat_reg, addr);
- reg_val = ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(4, xlat_reg, addr);
+ reg_val = intel_ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK;
if (reg_val != addr) {
- ntb_reg_write(4, xlat_reg, 0);
+ intel_ntb_reg_write(4, xlat_reg, 0);
return (EIO);
}
/* Set and verify the limit */
- ntb_reg_write(4, limit_reg, limit);
- reg_val = ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(4, limit_reg, limit);
+ reg_val = intel_ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK;
if (reg_val != limit) {
- ntb_reg_write(4, limit_reg, base);
- ntb_reg_write(4, xlat_reg, 0);
+ intel_ntb_reg_write(4, limit_reg, base);
+ intel_ntb_reg_write(4, xlat_reg, 0);
return (EIO);
}
}
return (0);
}
-/*
- * ntb_mw_clear_trans() - clear the translation of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- *
- * Clear the translation of a memory window. The peer may no longer access
- * local memory through the window.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_clear_trans(struct ntb_softc *ntb, unsigned mw_idx)
+static int
+intel_ntb_mw_clear_trans(device_t dev, unsigned mw_idx)
{
- return (ntb_mw_set_trans(ntb, mw_idx, 0, 0));
+ return (intel_ntb_mw_set_trans(dev, mw_idx, 0, 0));
}
-/*
- * ntb_mw_get_wc - Get the write-combine status of a memory window
- *
- * Returns: Zero on success, setting *wc; otherwise an error number (e.g. if
- * idx is an invalid memory window).
- *
- * Mode is a VM_MEMATTR_* type.
- */
-int
-ntb_mw_get_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t *mode)
+static int
+intel_ntb_mw_get_wc(device_t dev, unsigned idx, vm_memattr_t *mode)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)];
*mode = bar->map_mode;
return (0);
}
-/*
- * ntb_mw_set_wc - Set the write-combine status of a memory window
- *
- * If 'mode' matches the current status, this does nothing and succeeds. Mode
- * is a VM_MEMATTR_* type.
- *
- * Returns: Zero on success, setting the caching attribute on the virtual
- * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid
- * memory window, or if changing the caching attribute fails).
- */
-int
-ntb_mw_set_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
+static int
+intel_ntb_mw_set_wc(device_t dev, unsigned idx, vm_memattr_t mode)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
- return (ntb_mw_set_wc_internal(ntb, idx, mode));
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
+ return (intel_ntb_mw_set_wc_internal(ntb, idx, mode));
}
static int
-ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
+intel_ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
{
struct ntb_pci_bar_info *bar;
int rc;
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)];
if (bar->map_mode == mode)
return (0);
@@ -3234,26 +2988,19 @@ ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
return (rc);
}
-/**
- * ntb_peer_db_set() - Set the doorbell on the secondary/external side
- * @ntb: pointer to ntb_softc instance
- * @bit: doorbell bits to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- */
-void
-ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit)
+static void
+intel_ntb_peer_db_set(device_t dev, uint64_t bit)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
struct ntb_pci_bar_info *lapic;
unsigned i;
lapic = ntb->peer_lapic_bar;
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if ((bit & ntb_db_vector_mask(ntb, i)) != 0)
+ if ((bit & intel_ntb_db_vector_mask(dev, i)) != 0)
bus_space_write_4(lapic->pci_bus_tag,
lapic->pci_bus_handle,
ntb->peer_msix_data[i].nmd_ofs,
@@ -3262,99 +3009,76 @@ ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit)
return;
}
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
- ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
+ intel_ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
return;
}
db_iowrite(ntb, ntb->peer_reg->db_bell, bit);
}
-/*
- * ntb_get_peer_db_addr() - Return the address of the remote doorbell register,
- * as well as the size of the register (via *sz_out).
- *
- * This function allows a caller using I/OAT DMA to chain the remote doorbell
- * ring to its memory window write.
- *
- * Note that writing the peer doorbell via a memory window will *not* generate
- * an interrupt on the remote host; that must be done separately.
- */
-bus_addr_t
-ntb_get_peer_db_addr(struct ntb_softc *ntb, vm_size_t *sz_out)
+static int
+intel_ntb_peer_db_addr(device_t dev, bus_addr_t *db_addr, vm_size_t *db_size)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
uint64_t regoff;
- KASSERT(sz_out != NULL, ("must be non-NULL"));
+ KASSERT((db_addr != NULL && db_size != NULL), ("must be non-NULL"));
- if (!HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ if (!HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
bar = &ntb->bar_info[NTB_CONFIG_BAR];
regoff = ntb->peer_reg->db_bell;
} else {
KASSERT(ntb->b2b_mw_idx != B2B_MW_DISABLED,
("invalid b2b idx"));
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)];
regoff = XEON_PDOORBELL_OFFSET;
}
KASSERT(bar->pci_bus_tag != X86_BUS_SPACE_IO, ("uh oh"));
- *sz_out = ntb->reg->db_size;
/* HACK: Specific to current x86 bus implementation. */
- return ((uint64_t)bar->pci_bus_handle + regoff);
+ *db_addr = ((uint64_t)bar->pci_bus_handle + regoff);
+ *db_size = ntb->reg->db_size;
+ return (0);
}
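+
+/*
+ * As the removed ntb_get_peer_db_addr() comment explained: a caller using
+ * I/OAT DMA may chain a write to the returned *db_addr after its memory
+ * window copy; note that writing the peer doorbell through a memory
+ * window does not by itself interrupt the remote host.
+ */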
-/*
- * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
- * @ntb: NTB device context
- *
- * Hardware may support different number or arrangement of doorbell bits.
- *
- * Return: A mask of doorbell bits supported by the ntb.
- */
-uint64_t
-ntb_db_valid_mask(struct ntb_softc *ntb)
+static uint64_t
+intel_ntb_db_valid_mask(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
return (ntb->db_valid_mask);
}
-/*
- * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
- * @ntb: NTB device context
- * @vector: Doorbell vector number
- *
- * Each interrupt vector may have a different number or arrangement of bits.
- *
- * Return: A mask of doorbell bits serviced by a vector.
- */
-uint64_t
-ntb_db_vector_mask(struct ntb_softc *ntb, uint32_t vector)
+static int
+intel_ntb_db_vector_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
+
+ return (ntb->db_vec_count);
+}
+
+static uint64_t
+intel_ntb_db_vector_mask(device_t dev, uint32_t vector)
+{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (vector > ntb->db_vec_count)
return (0);
- return (ntb->db_valid_mask & ntb_vec_mask(ntb, vector));
+ return (ntb->db_valid_mask & intel_ntb_vec_mask(ntb, vector));
}
-/**
- * ntb_link_is_up() - get the current ntb link state
- * @ntb: NTB device context
- * @speed: OUT - The link speed expressed as PCIe generation number
- * @width: OUT - The link width expressed as the number of PCIe lanes
- *
- * RETURNS: true or false based on the hardware link state
- */
-bool
-ntb_link_is_up(struct ntb_softc *ntb, enum ntb_speed *speed,
- enum ntb_width *width)
+static bool
+intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed, enum ntb_width *width)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (speed != NULL)
- *speed = ntb_link_sta_speed(ntb);
+ *speed = intel_ntb_link_sta_speed(ntb);
if (width != NULL)
- *width = ntb_link_sta_width(ntb);
+ *width = intel_ntb_link_sta_width(ntb);
return (link_is_up(ntb));
}
@@ -3369,17 +3093,42 @@ save_bar_parameters(struct ntb_pci_bar_info *bar)
bar->vbase = rman_get_virtual(bar->pci_resource);
}
-device_t
-ntb_get_device(struct ntb_softc *ntb)
-{
-
- return (ntb->device);
-}
-
-/* Export HW-specific errata information. */
-bool
-ntb_has_feature(struct ntb_softc *ntb, uint32_t feature)
-{
+static device_method_t ntb_intel_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, intel_ntb_probe),
+ DEVMETHOD(device_attach, intel_ntb_attach),
+ DEVMETHOD(device_detach, intel_ntb_detach),
+ /* NTB interface */
+ DEVMETHOD(ntb_link_is_up, intel_ntb_link_is_up),
+ DEVMETHOD(ntb_link_enable, intel_ntb_link_enable),
+ DEVMETHOD(ntb_link_disable, intel_ntb_link_disable),
+ DEVMETHOD(ntb_link_enabled, intel_ntb_link_enabled),
+ DEVMETHOD(ntb_mw_count, intel_ntb_mw_count),
+ DEVMETHOD(ntb_mw_get_range, intel_ntb_mw_get_range),
+ DEVMETHOD(ntb_mw_set_trans, intel_ntb_mw_set_trans),
+ DEVMETHOD(ntb_mw_clear_trans, intel_ntb_mw_clear_trans),
+ DEVMETHOD(ntb_mw_get_wc, intel_ntb_mw_get_wc),
+ DEVMETHOD(ntb_mw_set_wc, intel_ntb_mw_set_wc),
+ DEVMETHOD(ntb_spad_count, intel_ntb_spad_count),
+ DEVMETHOD(ntb_spad_clear, intel_ntb_spad_clear),
+ DEVMETHOD(ntb_spad_write, intel_ntb_spad_write),
+ DEVMETHOD(ntb_spad_read, intel_ntb_spad_read),
+ DEVMETHOD(ntb_peer_spad_write, intel_ntb_peer_spad_write),
+ DEVMETHOD(ntb_peer_spad_read, intel_ntb_peer_spad_read),
+ DEVMETHOD(ntb_db_valid_mask, intel_ntb_db_valid_mask),
+ DEVMETHOD(ntb_db_vector_count, intel_ntb_db_vector_count),
+ DEVMETHOD(ntb_db_vector_mask, intel_ntb_db_vector_mask),
+ DEVMETHOD(ntb_db_clear, intel_ntb_db_clear),
+ DEVMETHOD(ntb_db_clear_mask, intel_ntb_db_clear_mask),
+ DEVMETHOD(ntb_db_read, intel_ntb_db_read),
+ DEVMETHOD(ntb_db_set_mask, intel_ntb_db_set_mask),
+ DEVMETHOD(ntb_peer_db_addr, intel_ntb_peer_db_addr),
+ DEVMETHOD(ntb_peer_db_set, intel_ntb_peer_db_set),
+ DEVMETHOD_END
+};
- return (HAS_FEATURE(feature));
-}
+static DEFINE_CLASS_0(ntb_hw, ntb_intel_driver, ntb_intel_methods,
+ sizeof(struct ntb_softc));
+DRIVER_MODULE(ntb_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL);
+MODULE_DEPEND(ntb_intel, ntb, 1, 1, 1);
+MODULE_VERSION(ntb_intel, 1);
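+
+/*
+ * DEFINE_CLASS_0() registers this driver under the shared "ntb_hw" class
+ * and DRIVER_MODULE() attaches it to the PCI bus; NTB clients (e.g. the
+ * ntb_transport driver added in this commit) attach as children of this
+ * device and call back through the ntb_if methods listed above.
+ */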
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h
deleted file mode 100644
index f05acda..0000000
--- a/sys/dev/ntb/ntb_hw/ntb_hw.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*-
- * Copyright (C) 2013 Intel Corporation
- * Copyright (C) 2015 EMC Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _NTB_HW_H_
-#define _NTB_HW_H_
-
-struct ntb_softc;
-
-#define NTB_MAX_NUM_MW 3
-
-enum ntb_speed {
- NTB_SPEED_AUTO = -1,
- NTB_SPEED_NONE = 0,
- NTB_SPEED_GEN1 = 1,
- NTB_SPEED_GEN2 = 2,
- NTB_SPEED_GEN3 = 3,
-};
-
-enum ntb_width {
- NTB_WIDTH_AUTO = -1,
- NTB_WIDTH_NONE = 0,
- NTB_WIDTH_1 = 1,
- NTB_WIDTH_2 = 2,
- NTB_WIDTH_4 = 4,
- NTB_WIDTH_8 = 8,
- NTB_WIDTH_12 = 12,
- NTB_WIDTH_16 = 16,
- NTB_WIDTH_32 = 32,
-};
-
-SYSCTL_DECL(_hw_ntb);
-
-typedef void (*ntb_db_callback)(void *data, uint32_t vector);
-typedef void (*ntb_event_callback)(void *data);
-
-struct ntb_ctx_ops {
- ntb_event_callback link_event;
- ntb_db_callback db_event;
-};
-
-device_t ntb_get_device(struct ntb_softc *);
-
-bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *);
-void ntb_link_event(struct ntb_softc *);
-int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width);
-int ntb_link_disable(struct ntb_softc *);
-bool ntb_link_enabled(struct ntb_softc *);
-
-int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *);
-void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **);
-void ntb_clear_ctx(struct ntb_softc *);
-
-uint8_t ntb_mw_count(struct ntb_softc *);
-int ntb_mw_get_range(struct ntb_softc *, unsigned mw_idx, vm_paddr_t *base,
- caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
- bus_addr_t *plimit);
-int ntb_mw_set_trans(struct ntb_softc *, unsigned mw_idx, bus_addr_t, size_t);
-int ntb_mw_clear_trans(struct ntb_softc *, unsigned mw_idx);
-
-int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode);
-int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode);
-
-uint8_t ntb_get_max_spads(struct ntb_softc *ntb);
-void ntb_spad_clear(struct ntb_softc *ntb);
-int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val);
-int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val);
-int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx,
- uint32_t val);
-int ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx,
- uint32_t *val);
-
-uint64_t ntb_db_valid_mask(struct ntb_softc *);
-uint64_t ntb_db_vector_mask(struct ntb_softc *, uint32_t vector);
-bus_addr_t ntb_get_peer_db_addr(struct ntb_softc *, vm_size_t *sz_out);
-
-void ntb_db_clear(struct ntb_softc *, uint64_t bits);
-void ntb_db_clear_mask(struct ntb_softc *, uint64_t bits);
-uint64_t ntb_db_read(struct ntb_softc *);
-void ntb_db_set_mask(struct ntb_softc *, uint64_t bits);
-void ntb_peer_db_set(struct ntb_softc *, uint64_t bits);
-
-#define XEON_SPAD_COUNT 16
-#define ATOM_SPAD_COUNT 16
-
-/* Hardware owns the low 16 bits of features. */
-#define NTB_BAR_SIZE_4K (1 << 0)
-#define NTB_SDOORBELL_LOCKUP (1 << 1)
-#define NTB_SB01BASE_LOCKUP (1 << 2)
-#define NTB_B2BDOORBELL_BIT14 (1 << 3)
-/* Software/configuration owns the top 16 bits. */
-#define NTB_SPLIT_BAR (1ull << 16)
-
-#define NTB_FEATURES_STR \
- "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \
- "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K"
-
-bool ntb_has_feature(struct ntb_softc *, uint32_t);
-
-#endif /* _NTB_HW_H_ */
diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h
index fb445d7..a037736 100644
--- a/sys/dev/ntb/ntb_hw/ntb_regs.h
+++ b/sys/dev/ntb/ntb_hw/ntb_regs.h
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -76,6 +77,7 @@
#define XEON_SDBMSK_OFFSET 0x0066
#define XEON_USMEMMISS_OFFSET 0x0070
#define XEON_SPAD_OFFSET 0x0080
+#define XEON_SPAD_COUNT 16
#define XEON_SPADSEMA4_OFFSET 0x00c0
#define XEON_WCCNTRL_OFFSET 0x00e0
#define XEON_UNCERRSTS_OFFSET 0x014c
@@ -104,6 +106,7 @@
#define ATOM_NTBCNTL_OFFSET 0x0060
#define ATOM_EBDF_OFFSET 0x0064
#define ATOM_SPAD_OFFSET 0x0080
+#define ATOM_SPAD_COUNT 16
#define ATOM_SPADSEMA_OFFSET 0x00c0
#define ATOM_STKYSPAD_OFFSET 0x00c4
#define ATOM_PBAR2XLAT_OFFSET 0x8008
diff --git a/sys/dev/ntb/ntb_if.m b/sys/dev/ntb/ntb_if.m
new file mode 100644
index 0000000..d8ca227
--- /dev/null
+++ b/sys/dev/ntb/ntb_if.m
@@ -0,0 +1,210 @@
+#-
+# Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+INTERFACE ntb;
+
+HEADER {
+ enum ntb_speed {
+ NTB_SPEED_AUTO = -1,
+ NTB_SPEED_NONE = 0,
+ NTB_SPEED_GEN1 = 1,
+ NTB_SPEED_GEN2 = 2,
+ NTB_SPEED_GEN3 = 3,
+ };
+
+ enum ntb_width {
+ NTB_WIDTH_AUTO = -1,
+ NTB_WIDTH_NONE = 0,
+ NTB_WIDTH_1 = 1,
+ NTB_WIDTH_2 = 2,
+ NTB_WIDTH_4 = 4,
+ NTB_WIDTH_8 = 8,
+ NTB_WIDTH_12 = 12,
+ NTB_WIDTH_16 = 16,
+ NTB_WIDTH_32 = 32,
+ };
+
+ typedef void (*ntb_db_callback)(void *data, uint32_t vector);
+ typedef void (*ntb_event_callback)(void *data);
+ struct ntb_ctx_ops {
+ ntb_event_callback link_event;
+ ntb_db_callback db_event;
+ };
+};
+
+METHOD bool link_is_up {
+ device_t ntb;
+ enum ntb_speed *speed;
+ enum ntb_width *width;
+};
+
+METHOD int link_enable {
+ device_t ntb;
+ enum ntb_speed speed;
+ enum ntb_width width;
+};
+
+METHOD int link_disable {
+ device_t ntb;
+};
+
+METHOD bool link_enabled {
+ device_t ntb;
+};
+
+METHOD int set_ctx {
+ device_t ntb;
+ void *ctx;
+ const struct ntb_ctx_ops *ctx_ops;
+};
+
+METHOD void * get_ctx {
+ device_t ntb;
+ const struct ntb_ctx_ops **ctx_ops;
+};
+
+METHOD void clear_ctx {
+ device_t ntb;
+};
+
+METHOD uint8_t mw_count {
+ device_t ntb;
+};
+
+METHOD int mw_get_range {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_paddr_t *base;
+ caddr_t *vbase;
+ size_t *size;
+ size_t *align;
+ size_t *align_size;
+ bus_addr_t *plimit;
+};
+
+METHOD int mw_set_trans {
+ device_t ntb;
+ unsigned mw_idx;
+ bus_addr_t addr;
+ size_t size;
+};
+
+METHOD int mw_clear_trans {
+ device_t ntb;
+ unsigned mw_idx;
+};
+
+METHOD int mw_get_wc {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_memattr_t *mode;
+};
+
+METHOD int mw_set_wc {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_memattr_t mode;
+};
+
+METHOD uint8_t spad_count {
+ device_t ntb;
+};
+
+METHOD void spad_clear {
+ device_t ntb;
+};
+
+METHOD int spad_write {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t val;
+};
+
+METHOD int spad_read {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t *val;
+};
+
+METHOD int peer_spad_write {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t val;
+};
+
+METHOD int peer_spad_read {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t *val;
+};
+
+METHOD uint64_t db_valid_mask {
+ device_t ntb;
+};
+
+METHOD int db_vector_count {
+ device_t ntb;
+};
+
+METHOD uint64_t db_vector_mask {
+ device_t ntb;
+ uint32_t vector;
+};
+
+METHOD int peer_db_addr {
+ device_t ntb;
+ bus_addr_t *db_addr;
+ vm_size_t *db_size;
+};
+
+METHOD void db_clear {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD void db_clear_mask {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD uint64_t db_read {
+ device_t ntb;
+};
+
+METHOD void db_set_mask {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD void peer_db_set {
+ device_t ntb;
+ uint64_t bits;
+};
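+
+# Each METHOD above is compiled by the kobj interface tooling into an
+# NTB_<NAME>() dispatch macro.  Illustrative consumer sketch, assuming
+# "hwdev" is the ntb_hw device implementing this interface:
+#
+#	uint64_t mask = NTB_DB_VALID_MASK(hwdev);
+#	NTB_PEER_DB_SET(hwdev, mask & 1);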
diff --git a/sys/dev/ntb/ntb_transport.c b/sys/dev/ntb/ntb_transport.c
new file mode 100644
index 0000000..5297db9
--- /dev/null
+++ b/sys/dev/ntb/ntb_transport.c
@@ -0,0 +1,1521 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * Copyright (C) 2013 Intel Corporation
+ * Copyright (C) 2015 EMC Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a transport for sending and receiving messages by
+ * writing to remote memory window(s) provided by the underlying NTB device.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+
+#include "ntb.h"
+#include "ntb_transport.h"
+
+#define KTR_NTB KTR_SPARE3
+
+#define NTB_TRANSPORT_VERSION 4
+
+static SYSCTL_NODE(_hw, OID_AUTO, ntb_transport, CTLFLAG_RW, 0, "ntb_transport");
+
+static unsigned g_ntb_transport_debug_level;
+TUNABLE_INT("hw.ntb_transport.debug_level", &g_ntb_transport_debug_level);
+SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, debug_level, CTLFLAG_RWTUN,
+ &g_ntb_transport_debug_level, 0,
+ "ntb_transport log level -- higher is more verbose");
+#define ntb_printf(lvl, ...) do { \
+ if ((lvl) <= g_ntb_transport_debug_level) { \
+ printf(__VA_ARGS__); \
+ } \
+} while (0)
+
+static unsigned transport_mtu = 0x10000;
+
+static uint64_t max_mw_size;
+TUNABLE_QUAD("hw.ntb_transport.max_mw_size", &max_mw_size);
+SYSCTL_UQUAD(_hw_ntb_transport, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0,
+ "If enabled (non-zero), limit the size of large memory windows. "
+ "Both sides of the NTB MUST set the same value here.");
+
+static unsigned enable_xeon_watchdog;
+TUNABLE_INT("hw.ntb_transport.enable_xeon_watchdog", &enable_xeon_watchdog);
+SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN,
+ &enable_xeon_watchdog, 0, "If non-zero, write a register every second to "
+ "keep a watchdog from tearing down the NTB link");
+
+STAILQ_HEAD(ntb_queue_list, ntb_queue_entry);
+
+typedef uint32_t ntb_q_idx_t;
+
+struct ntb_queue_entry {
+ /* ntb_queue list reference */
+ STAILQ_ENTRY(ntb_queue_entry) entry;
+
+ /* info on data to be transferred */
+ void *cb_data;
+ void *buf;
+ uint32_t len;
+ uint32_t flags;
+
+ struct ntb_transport_qp *qp;
+ struct ntb_payload_header *x_hdr;
+ ntb_q_idx_t index;
+};
+
+struct ntb_rx_info {
+ ntb_q_idx_t entry;
+};
+
+struct ntb_transport_qp {
+ struct ntb_transport_ctx *transport;
+ device_t dev;
+
+ void *cb_data;
+
+ bool client_ready;
+ volatile bool link_is_up;
+ uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */
+
+ struct ntb_rx_info *rx_info;
+ struct ntb_rx_info *remote_rx_info;
+
+ void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ struct ntb_queue_list tx_free_q;
+ struct mtx ntb_tx_free_q_lock;
+ caddr_t tx_mw;
+ bus_addr_t tx_mw_phys;
+ ntb_q_idx_t tx_index;
+ ntb_q_idx_t tx_max_entry;
+ uint64_t tx_max_frame;
+
+ void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ struct ntb_queue_list rx_post_q;
+ struct ntb_queue_list rx_pend_q;
+ /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */
+ struct mtx ntb_rx_q_lock;
+ struct task rxc_db_work;
+ struct taskqueue *rxc_tq;
+ caddr_t rx_buff;
+ ntb_q_idx_t rx_index;
+ ntb_q_idx_t rx_max_entry;
+ uint64_t rx_max_frame;
+
+ void (*event_handler)(void *data, enum ntb_link_event status);
+ struct callout link_work;
+ struct callout rx_full;
+
+ uint64_t last_rx_no_buf;
+
+ /* Stats */
+ uint64_t rx_bytes;
+ uint64_t rx_pkts;
+ uint64_t rx_ring_empty;
+ uint64_t rx_err_no_buf;
+ uint64_t rx_err_oflow;
+ uint64_t rx_err_ver;
+ uint64_t tx_bytes;
+ uint64_t tx_pkts;
+ uint64_t tx_ring_full;
+ uint64_t tx_err_no_buf;
+
+ struct mtx tx_lock;
+};
+
+struct ntb_transport_mw {
+ vm_paddr_t phys_addr;
+ size_t phys_size;
+ size_t xlat_align;
+ size_t xlat_align_size;
+ bus_addr_t addr_limit;
+ /* Tx buff is off vbase / phys_addr */
+ caddr_t vbase;
+ size_t xlat_size;
+ size_t buff_size;
+ /* Rx buff is off virt_addr / dma_addr */
+ caddr_t virt_addr;
+ bus_addr_t dma_addr;
+};
+
+struct ntb_transport_child {
+ device_t dev;
+ int qpoff;
+ int qpcnt;
+ struct ntb_transport_child *next;
+};
+
+struct ntb_transport_ctx {
+ device_t dev;
+ struct ntb_transport_child *child;
+ struct ntb_transport_mw *mw_vec;
+ struct ntb_transport_qp *qp_vec;
+ unsigned mw_count;
+ unsigned qp_count;
+ uint64_t qp_bitmap;
+ volatile bool link_is_up;
+ struct callout link_work;
+ struct callout link_watchdog;
+ struct task link_cleanup;
+};
+
+enum {
+ NTBT_DESC_DONE_FLAG = 1 << 0,
+ NTBT_LINK_DOWN_FLAG = 1 << 1,
+};
+
+struct ntb_payload_header {
+ ntb_q_idx_t ver;
+ uint32_t len;
+ uint32_t flags;
+};
+
+enum {
+ /*
+ * The order of this enum is part of the remote protocol. Do not
+ * reorder without bumping protocol version (and it's probably best
+	 * to keep the protocol in lock-step with the Linux NTB driver).
+ */
+ NTBT_VERSION = 0,
+ NTBT_QP_LINKS,
+ NTBT_NUM_QPS,
+ NTBT_NUM_MWS,
+ /*
+ * N.B.: transport_link_work assumes MW1 enums = MW0 + 2.
+ */
+ NTBT_MW0_SZ_HIGH,
+ NTBT_MW0_SZ_LOW,
+ NTBT_MW1_SZ_HIGH,
+ NTBT_MW1_SZ_LOW,
+
+ /*
+	 * Some NTB-using hardware has a watchdog to work around NTB hangs; if
+ * a register or doorbell isn't written every few seconds, the link is
+ * torn down. Write an otherwise unused register every few seconds to
+ * work around this watchdog.
+ */
+ NTBT_WATCHDOG_SPAD = 15
+};
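+
+/*
+ * Handshake sketch (see ntb_transport_link_work() below): each side
+ * writes its MW sizes, NTBT_NUM_MWS, NTBT_NUM_QPS and NTBT_VERSION into
+ * the peer's scratchpads, then reads its local scratchpads until the
+ * values the peer wrote back match, and only then sets up the memory
+ * windows and brings the queue pairs up.
+ */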
+
+#define QP_TO_MW(nt, qp)	((qp) % (nt)->mw_count)
+#define NTB_QP_DEF_NUM_ENTRIES 100
+#define NTB_LINK_DOWN_TIMEOUT 10
+
+static int ntb_transport_probe(device_t dev);
+static int ntb_transport_attach(device_t dev);
+static int ntb_transport_detach(device_t dev);
+static void ntb_transport_init_queue(struct ntb_transport_ctx *nt,
+ unsigned int qp_num);
+static int ntb_process_tx(struct ntb_transport_qp *qp,
+ struct ntb_queue_entry *entry);
+static void ntb_transport_rxc_db(void *arg, int pending);
+static int ntb_process_rxc(struct ntb_transport_qp *qp);
+static void ntb_memcpy_rx(struct ntb_transport_qp *qp,
+ struct ntb_queue_entry *entry, void *offset);
+static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp,
+ void *data);
+static void ntb_complete_rxc(struct ntb_transport_qp *qp);
+static void ntb_transport_doorbell_callback(void *data, uint32_t vector);
+static void ntb_transport_event_callback(void *data);
+static void ntb_transport_link_work(void *arg);
+static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size);
+static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw);
+static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
+ unsigned int qp_num);
+static void ntb_qp_link_work(void *arg);
+static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt);
+static void ntb_transport_link_cleanup_work(void *, int);
+static void ntb_qp_link_down(struct ntb_transport_qp *qp);
+static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp);
+static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp);
+static void ntb_send_link_down(struct ntb_transport_qp *qp);
+static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
+ struct ntb_queue_list *list);
+static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock,
+ struct ntb_queue_list *list);
+static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock,
+ struct ntb_queue_list *from, struct ntb_queue_list *to);
+static void xeon_link_watchdog_hb(void *);
+
+static const struct ntb_ctx_ops ntb_transport_ops = {
+ .link_event = ntb_transport_event_callback,
+ .db_event = ntb_transport_doorbell_callback,
+};
+
+MALLOC_DEFINE(M_NTB_T, "ntb_transport", "ntb transport driver");
+
+static inline void
+iowrite32(uint32_t val, void *addr)
+{
+
+ bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr,
+ val);
+}
+
+/* Transport Init and teardown */
+
+static void
+xeon_link_watchdog_hb(void *arg)
+{
+ struct ntb_transport_ctx *nt;
+
+ nt = arg;
+ ntb_spad_write(nt->dev, NTBT_WATCHDOG_SPAD, 0);
+ callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt);
+}
+
+static int
+ntb_transport_probe(device_t dev)
+{
+
+ device_set_desc(dev, "NTB Transport");
+ return (0);
+}
+
+static int
+ntb_transport_attach(device_t dev)
+{
+ struct ntb_transport_ctx *nt = device_get_softc(dev);
+ struct ntb_transport_child **cpp = &nt->child;
+ struct ntb_transport_child *nc;
+ struct ntb_transport_mw *mw;
+ uint64_t db_bitmap;
+ int rc, i, db_count, spad_count, qp, qpu, qpo, qpt;
+ char cfg[128] = "";
+ char buf[32];
+ char *n, *np, *c, *name;
+
+ nt->dev = dev;
+ nt->mw_count = ntb_mw_count(dev);
+ spad_count = ntb_spad_count(dev);
+ db_bitmap = ntb_db_valid_mask(dev);
+ db_count = flsll(db_bitmap);
+	KASSERT(db_bitmap == ((uint64_t)1 << db_count) - 1,
+	    ("Doorbells are not sequential (%jx).\n", (uintmax_t)db_bitmap));
+
+ device_printf(dev, "%d memory windows, %d scratchpads, "
+ "%d doorbells\n", nt->mw_count, spad_count, db_count);
+
+ if (nt->mw_count == 0) {
+ device_printf(dev, "At least 1 memory window required.\n");
+ return (ENXIO);
+ }
+ if (spad_count < 6) {
+ device_printf(dev, "At least 6 scratchpads required.\n");
+ return (ENXIO);
+ }
+ if (spad_count < 4 + 2 * nt->mw_count) {
+ nt->mw_count = (spad_count - 4) / 2;
+ device_printf(dev, "Scratchpads enough only for %d "
+ "memory windows.\n", nt->mw_count);
+ }
+ if (db_bitmap == 0) {
+ device_printf(dev, "At least one doorbell required.\n");
+ return (ENXIO);
+ }
+
+ nt->mw_vec = malloc(nt->mw_count * sizeof(*nt->mw_vec), M_NTB_T,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < nt->mw_count; i++) {
+ mw = &nt->mw_vec[i];
+
+ rc = ntb_mw_get_range(dev, i, &mw->phys_addr, &mw->vbase,
+ &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size,
+ &mw->addr_limit);
+ if (rc != 0)
+ goto err;
+
+ mw->buff_size = 0;
+ mw->xlat_size = 0;
+ mw->virt_addr = NULL;
+ mw->dma_addr = 0;
+
+ rc = ntb_mw_set_wc(dev, i, VM_MEMATTR_WRITE_COMBINING);
+ if (rc)
+ ntb_printf(0, "Unable to set mw%d caching\n", i);
+ }
+
+ qpu = 0;
+ qpo = imin(db_count, nt->mw_count);
+ qpt = db_count;
+
+ snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev),
+ device_get_unit(dev));
+ TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg));
+ n = cfg;
+ i = 0;
+ while ((c = strsep(&n, ",")) != NULL) {
+ np = c;
+ name = strsep(&np, ":");
+ if (name != NULL && name[0] == 0)
+ name = NULL;
+ qp = (np && np[0] != 0) ? strtol(np, NULL, 10) : qpo - qpu;
+ if (qp <= 0)
+ qp = 1;
+
+ if (qp > qpt - qpu) {
+ device_printf(dev, "Not enough resources for config\n");
+ break;
+ }
+
+ nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO);
+ nc->qpoff = qpu;
+ nc->qpcnt = qp;
+ nc->dev = device_add_child(dev, name, -1);
+ if (nc->dev == NULL) {
+ device_printf(dev, "Can not add child.\n");
+ break;
+ }
+ device_set_ivars(nc->dev, nc);
+ *cpp = nc;
+ cpp = &nc->next;
+
+ if (bootverbose) {
+ device_printf(dev, "%d \"%s\": queues %d",
+ i, name, qpu);
+ if (qp > 1)
+ printf("-%d", qpu + qp - 1);
+ printf("\n");
+ }
+
+ qpu += qp;
+ i++;
+ }
+ nt->qp_count = qpu;
+
+ nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_T,
+ M_WAITOK | M_ZERO);
+
+ for (i = 0; i < nt->qp_count; i++)
+ ntb_transport_init_queue(nt, i);
+
+ callout_init(&nt->link_work, 0);
+ callout_init(&nt->link_watchdog, 0);
+ TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt);
+
+ rc = ntb_set_ctx(dev, nt, &ntb_transport_ops);
+ if (rc != 0)
+ goto err;
+
+ nt->link_is_up = false;
+ ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+
+ if (enable_xeon_watchdog != 0)
+ callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt);
+
+ bus_generic_attach(dev);
+ return (0);
+
+err:
+ free(nt->qp_vec, M_NTB_T);
+ free(nt->mw_vec, M_NTB_T);
+ return (rc);
+}
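+
+/*
+ * The "hint.<driver>.<unit>.config" string parsed above is a comma-
+ * separated list of "name:qpcount" entries.  A hypothetical example
+ * (child names and counts are illustrative only):
+ *
+ *	hint.ntb_transport.0.config="if_ntb:1,mychild"
+ *
+ * attaches an if_ntb child with one queue and gives "mychild" the
+ * default count (qpo - qpu); an omitted name adds a wildcard child.
+ */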
+
+static int
+ntb_transport_detach(device_t dev)
+{
+ struct ntb_transport_ctx *nt = device_get_softc(dev);
+ struct ntb_transport_child **cpp = &nt->child;
+ struct ntb_transport_child *nc;
+ int error = 0, i;
+
+ while ((nc = *cpp) != NULL) {
+ *cpp = (*cpp)->next;
+ error = device_delete_child(dev, nc->dev);
+ if (error)
+ break;
+ free(nc, M_DEVBUF);
+ }
+ KASSERT(nt->qp_bitmap == 0,
+ ("Some queues not freed on detach (%jx)", nt->qp_bitmap));
+
+ ntb_transport_link_cleanup(nt);
+ taskqueue_drain(taskqueue_swi, &nt->link_cleanup);
+ callout_drain(&nt->link_work);
+ callout_drain(&nt->link_watchdog);
+
+ ntb_link_disable(dev);
+ ntb_clear_ctx(dev);
+
+ for (i = 0; i < nt->mw_count; i++)
+ ntb_free_mw(nt, i);
+
+ free(nt->qp_vec, M_NTB_T);
+ free(nt->mw_vec, M_NTB_T);
+ return (0);
+}
+
+int
+ntb_transport_queue_count(device_t dev)
+{
+ struct ntb_transport_child *nc = device_get_ivars(dev);
+
+ return (nc->qpcnt);
+}
+
+static void
+ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num)
+{
+ struct ntb_transport_mw *mw;
+ struct ntb_transport_qp *qp;
+ vm_paddr_t mw_base;
+ uint64_t mw_size, qp_offset;
+ size_t tx_size;
+ unsigned num_qps_mw, mw_num, mw_count;
+
+ mw_count = nt->mw_count;
+ mw_num = QP_TO_MW(nt, qp_num);
+ mw = &nt->mw_vec[mw_num];
+
+ qp = &nt->qp_vec[qp_num];
+ qp->qp_num = qp_num;
+ qp->transport = nt;
+ qp->dev = nt->dev;
+ qp->client_ready = false;
+ qp->event_handler = NULL;
+ ntb_qp_link_down_reset(qp);
+
+ if (mw_num < nt->qp_count % mw_count)
+ num_qps_mw = nt->qp_count / mw_count + 1;
+ else
+ num_qps_mw = nt->qp_count / mw_count;
+
+ mw_base = mw->phys_addr;
+ mw_size = mw->phys_size;
+
+ tx_size = mw_size / num_qps_mw;
+ qp_offset = tx_size * (qp_num / mw_count);
+
+ qp->tx_mw = mw->vbase + qp_offset;
+ KASSERT(qp->tx_mw != NULL, ("uh oh?"));
+
+ /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */
+ qp->tx_mw_phys = mw_base + qp_offset;
+ KASSERT(qp->tx_mw_phys != 0, ("uh oh?"));
+
+ tx_size -= sizeof(struct ntb_rx_info);
+ qp->rx_info = (void *)(qp->tx_mw + tx_size);
+
+	/* Due to housekeeping, there must be at least 2 buffers. */
+ qp->tx_max_frame = qmin(transport_mtu, tx_size / 2);
+ qp->tx_max_entry = tx_size / qp->tx_max_frame;
+
+ callout_init(&qp->link_work, 0);
+ callout_init(&qp->rx_full, 1);
+
+ mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN);
+ mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN);
+ mtx_init(&qp->tx_lock, "ntb transport tx", NULL, MTX_DEF);
+ TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp);
+ qp->rxc_tq = taskqueue_create("ntbt_rx", M_WAITOK,
+ taskqueue_thread_enqueue, &qp->rxc_tq);
+ taskqueue_start_threads(&qp->rxc_tq, 1, PI_NET, "%s rx%d",
+ device_get_nameunit(nt->dev), qp_num);
+
+ STAILQ_INIT(&qp->rx_post_q);
+ STAILQ_INIT(&qp->rx_pend_q);
+ STAILQ_INIT(&qp->tx_free_q);
+}
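+
+/*
+ * Worked example of the partitioning above: with mw_count = 2 and
+ * qp_count = 3, QP_TO_MW() places QPs 0 and 2 on MW0 and QP 1 on MW1.
+ * num_qps_mw is then 2 for MW0 (halving its per-QP tx_size) and 1 for
+ * MW1, and QP 2 lands at qp_offset = tx_size * (2 / 2) within MW0.
+ */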
+
+void
+ntb_transport_free_queue(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+ struct ntb_queue_entry *entry;
+
+ callout_drain(&qp->link_work);
+
+ ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
+ taskqueue_drain_all(qp->rxc_tq);
+ taskqueue_free(qp->rxc_tq);
+
+ qp->cb_data = NULL;
+ qp->rx_handler = NULL;
+ qp->tx_handler = NULL;
+ qp->event_handler = NULL;
+
+ while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q)))
+ free(entry, M_NTB_T);
+
+ while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q)))
+ free(entry, M_NTB_T);
+
+ while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
+ free(entry, M_NTB_T);
+
+ nt->qp_bitmap &= ~(1 << qp->qp_num);
+}
+
+/**
+ * ntb_transport_create_queue - Create a new NTB transport layer queue
+ * @rx_handler: receive callback function
+ * @tx_handler: transmit callback function
+ * @event_handler: event callback function
+ *
+ * Create a new NTB transport layer queue and provide the queue with a callback
+ * routine for both transmit and receive. The receive callback routine will be
+ * used to pass up data when the transport has received it on the queue. The
+ * transmit callback routine will be called when the transport has completed the
+ * transmission of the data on the queue and the data is ready to be freed.
+ *
+ * RETURNS: pointer to newly created ntb_queue, NULL on error.
+ */
+struct ntb_transport_qp *
+ntb_transport_create_queue(device_t dev, int q,
+ const struct ntb_queue_handlers *handlers, void *data)
+{
+ struct ntb_transport_child *nc = device_get_ivars(dev);
+ struct ntb_transport_ctx *nt = device_get_softc(device_get_parent(dev));
+ struct ntb_queue_entry *entry;
+ struct ntb_transport_qp *qp;
+ int i;
+
+ if (q < 0 || q >= nc->qpcnt)
+ return (NULL);
+
+ qp = &nt->qp_vec[nc->qpoff + q];
+ nt->qp_bitmap |= (1 << qp->qp_num);
+ qp->cb_data = data;
+ qp->rx_handler = handlers->rx_handler;
+ qp->tx_handler = handlers->tx_handler;
+ qp->event_handler = handlers->event_handler;
+
+ for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+ entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
+ entry->cb_data = data;
+ entry->buf = NULL;
+ entry->len = transport_mtu;
+ entry->qp = qp;
+ ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q);
+ }
+
+ for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+ entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
+ entry->qp = qp;
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ }
+
+ ntb_db_clear(dev, 1ull << qp->qp_num);
+ return (qp);
+}
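+
+/*
+ * Usage sketch (hypothetical client; the my_*() callbacks and "sc" are
+ * assumptions, not part of this commit):
+ *
+ *	static const struct ntb_queue_handlers my_handlers = {
+ *		.rx_handler = my_rx,
+ *		.tx_handler = my_tx,
+ *		.event_handler = my_event,
+ *	};
+ *
+ *	qp = ntb_transport_create_queue(dev, 0, &my_handlers, sc);
+ *	if (qp == NULL)
+ *		return (ENXIO);
+ *	ntb_transport_link_up(qp);
+ */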
+
+/**
+ * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
+ * @qp: NTB transport layer queue to be enabled
+ *
+ * Notify NTB transport layer of client readiness to use queue
+ */
+void
+ntb_transport_link_up(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+
+ qp->client_ready = true;
+
+ ntb_printf(2, "qp %d client ready\n", qp->qp_num);
+
+ if (nt->link_is_up)
+ callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+}
+
+
+
+/* Transport Tx */
+
+/**
+ * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
+ * @qp: NTB transport layer queue the entry is to be enqueued on
+ * @cb: per buffer pointer for callback function to use
+ * @data: pointer to data buffer that will be sent
+ * @len: length of the data buffer
+ *
+ * Enqueue a new transmit buffer onto the transport queue from which a NTB
+ * payload will be transmitted. This assumes that a lock is being held to
+ * serialize access to the qp.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int
+ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len)
+{
+ struct ntb_queue_entry *entry;
+ int rc;
+
+ if (!qp->link_is_up || len == 0) {
+ CTR0(KTR_NTB, "TX: link not up");
+ return (EINVAL);
+ }
+
+ entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
+ if (entry == NULL) {
+ CTR0(KTR_NTB, "TX: could not get entry from tx_free_q");
+ qp->tx_err_no_buf++;
+ return (EBUSY);
+ }
+ CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry);
+
+ entry->cb_data = cb;
+ entry->buf = data;
+ entry->len = len;
+ entry->flags = 0;
+
+ mtx_lock(&qp->tx_lock);
+ rc = ntb_process_tx(qp, entry);
+ mtx_unlock(&qp->tx_lock);
+ if (rc != 0) {
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ CTR1(KTR_NTB,
+ "TX: process_tx failed. Returning entry %p to tx_free_q",
+ entry);
+ }
+ return (rc);
+}
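+
+/*
+ * TX usage sketch (assumes an mbuf-based caller, as in if_ntb): on
+ * success the transport owns the mbuf and returns it via tx_handler
+ * once copied; on a non-zero return (EINVAL, EBUSY or EAGAIN) the
+ * caller still owns it:
+ *
+ *	rc = ntb_transport_tx_enqueue(qp, m, m, m_length(m, NULL));
+ *	if (rc != 0)
+ *		m_freem(m);
+ */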
+
+static void
+ntb_tx_copy_callback(void *data)
+{
+ struct ntb_queue_entry *entry = data;
+ struct ntb_transport_qp *qp = entry->qp;
+ struct ntb_payload_header *hdr = entry->x_hdr;
+
+ iowrite32(entry->flags | NTBT_DESC_DONE_FLAG, &hdr->flags);
+ CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr);
+
+ ntb_peer_db_set(qp->dev, 1ull << qp->qp_num);
+
+ /*
+ * The entry length can only be zero if the packet is intended to be a
+ * "link down" or similar. Since no payload is being sent in these
+ * cases, there is nothing to add to the completion queue.
+ */
+ if (entry->len > 0) {
+ qp->tx_bytes += entry->len;
+
+ if (qp->tx_handler)
+ qp->tx_handler(qp, qp->cb_data, entry->buf,
+ entry->len);
+ else
+ m_freem(entry->buf);
+ entry->buf = NULL;
+ }
+
+ CTR3(KTR_NTB,
+ "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning "
+ "to tx_free_q", entry, hdr->ver, hdr->flags);
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+}
+
+static void
+ntb_memcpy_tx(struct ntb_queue_entry *entry, void *offset)
+{
+
+ CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset);
+ if (entry->buf != NULL) {
+ m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset);
+
+ /*
+ * Ensure that the data is fully copied before setting the
+ * flags
+ */
+ wmb();
+ }
+
+ ntb_tx_copy_callback(entry);
+}
+
+static void
+ntb_async_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
+{
+ struct ntb_payload_header *hdr;
+ void *offset;
+
+ offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
+ hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame -
+ sizeof(struct ntb_payload_header));
+ entry->x_hdr = hdr;
+
+ iowrite32(entry->len, &hdr->len);
+ iowrite32(qp->tx_pkts, &hdr->ver);
+
+ ntb_memcpy_tx(entry, offset);
+}
+
+static int
+ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
+{
+
+ CTR3(KTR_NTB,
+ "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u",
+ qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry);
+ if (qp->tx_index == qp->remote_rx_info->entry) {
+ CTR0(KTR_NTB, "TX: ring full");
+ qp->tx_ring_full++;
+ return (EAGAIN);
+ }
+
+ if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
+ if (qp->tx_handler != NULL)
+ qp->tx_handler(qp, qp->cb_data, entry->buf,
+ EIO);
+ else
+ m_freem(entry->buf);
+
+ entry->buf = NULL;
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ CTR1(KTR_NTB,
+ "TX: frame too big. returning entry %p to tx_free_q",
+ entry);
+ return (0);
+ }
+ CTR2(KTR_NTB, "TX: copying entry %p to index %u", entry, qp->tx_index);
+ ntb_async_tx(qp, entry);
+
+ qp->tx_index++;
+ qp->tx_index %= qp->tx_max_entry;
+
+ qp->tx_pkts++;
+
+ return (0);
+}
+
+/* Transport Rx */
+static void
+ntb_transport_rxc_db(void *arg, int pending __unused)
+{
+ struct ntb_transport_qp *qp = arg;
+ int rc;
+
+ CTR0(KTR_NTB, "RX: transport_rx");
+again:
+ while ((rc = ntb_process_rxc(qp)) == 0)
+ ;
+ CTR1(KTR_NTB, "RX: process_rxc returned %d", rc);
+
+ if ((ntb_db_read(qp->dev) & (1ull << qp->qp_num)) != 0) {
+ /* If db is set, clear it and check queue once more. */
+ ntb_db_clear(qp->dev, 1ull << qp->qp_num);
+ goto again;
+ }
+}
+
+static int
+ntb_process_rxc(struct ntb_transport_qp *qp)
+{
+ struct ntb_payload_header *hdr;
+ struct ntb_queue_entry *entry;
+ caddr_t offset;
+
+ offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
+ hdr = (void *)(offset + qp->rx_max_frame -
+ sizeof(struct ntb_payload_header));
+
+ CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index);
+ if ((hdr->flags & NTBT_DESC_DONE_FLAG) == 0) {
+ CTR0(KTR_NTB, "RX: hdr not done");
+ qp->rx_ring_empty++;
+ return (EAGAIN);
+ }
+
+ if ((hdr->flags & NTBT_LINK_DOWN_FLAG) != 0) {
+ CTR0(KTR_NTB, "RX: link down");
+ ntb_qp_link_down(qp);
+ hdr->flags = 0;
+ return (EAGAIN);
+ }
+
+ if (hdr->ver != (uint32_t)qp->rx_pkts) {
+		CTR2(KTR_NTB, "RX: ver != rx_pkts (%x != %lx). "
+ "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts);
+ qp->rx_err_ver++;
+ return (EIO);
+ }
+
+ entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q);
+ if (entry == NULL) {
+ qp->rx_err_no_buf++;
+ CTR0(KTR_NTB, "RX: No entries in rx_pend_q");
+ return (EAGAIN);
+ }
+ callout_stop(&qp->rx_full);
+ CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry);
+
+ entry->x_hdr = hdr;
+ entry->index = qp->rx_index;
+
+ if (hdr->len > entry->len) {
+ CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju",
+ (uintmax_t)hdr->len, (uintmax_t)entry->len);
+ qp->rx_err_oflow++;
+
+ entry->len = -EIO;
+ entry->flags |= NTBT_DESC_DONE_FLAG;
+
+ ntb_complete_rxc(qp);
+ } else {
+ qp->rx_bytes += hdr->len;
+ qp->rx_pkts++;
+
+ CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts);
+
+ entry->len = hdr->len;
+
+ ntb_memcpy_rx(qp, entry, offset);
+ }
+
+ qp->rx_index++;
+ qp->rx_index %= qp->rx_max_entry;
+ return (0);
+}
+
+static void
+ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
+ void *offset)
+{
+ struct ifnet *ifp = entry->cb_data;
+ unsigned int len = entry->len;
+
+ CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset);
+
+ entry->buf = (void *)m_devget(offset, len, 0, ifp, NULL);
+ if (entry->buf == NULL)
+ entry->len = -ENOMEM;
+
+ /* Ensure that the data is globally visible before clearing the flag */
+ wmb();
+
+ CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, entry->buf);
+ ntb_rx_copy_callback(qp, entry);
+}
+
+static inline void
+ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data)
+{
+ struct ntb_queue_entry *entry;
+
+ entry = data;
+ entry->flags |= NTBT_DESC_DONE_FLAG;
+ ntb_complete_rxc(qp);
+}
+
+static void
+ntb_complete_rxc(struct ntb_transport_qp *qp)
+{
+ struct ntb_queue_entry *entry;
+ struct mbuf *m;
+ unsigned len;
+
+ CTR0(KTR_NTB, "RX: rx_completion_task");
+
+ mtx_lock_spin(&qp->ntb_rx_q_lock);
+
+ while (!STAILQ_EMPTY(&qp->rx_post_q)) {
+ entry = STAILQ_FIRST(&qp->rx_post_q);
+ if ((entry->flags & NTBT_DESC_DONE_FLAG) == 0)
+ break;
+
+ entry->x_hdr->flags = 0;
+ iowrite32(entry->index, &qp->rx_info->entry);
+
+ STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry);
+
+ len = entry->len;
+ m = entry->buf;
+
+ /*
+ * Re-initialize queue_entry for reuse; rx_handler takes
+ * ownership of the mbuf.
+ */
+ entry->buf = NULL;
+ entry->len = transport_mtu;
+ entry->cb_data = qp->cb_data;
+
+ STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry);
+
+ mtx_unlock_spin(&qp->ntb_rx_q_lock);
+
+ CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m);
+ if (qp->rx_handler != NULL && qp->client_ready)
+ qp->rx_handler(qp, qp->cb_data, m, len);
+ else
+ m_freem(m);
+
+ mtx_lock_spin(&qp->ntb_rx_q_lock);
+ }
+
+ mtx_unlock_spin(&qp->ntb_rx_q_lock);
+}
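+
+/*
+ * RX entry lifecycle: free entries sit on rx_pend_q; ntb_process_rxc()
+ * moves one to rx_post_q per received frame and copies the payload into
+ * an mbuf; once the entry is flagged done above, its index is echoed to
+ * the peer through rx_info->entry and the entry is recycled back onto
+ * rx_pend_q.
+ */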
+
+static void
+ntb_transport_doorbell_callback(void *data, uint32_t vector)
+{
+ struct ntb_transport_ctx *nt = data;
+ struct ntb_transport_qp *qp;
+ uint64_t vec_mask;
+ unsigned qp_num;
+
+ vec_mask = ntb_db_vector_mask(nt->dev, vector);
+ vec_mask &= nt->qp_bitmap;
+	/* If more than one QP may share this vector, read the doorbell. */
+	if ((vec_mask & (vec_mask - 1)) != 0)
+		vec_mask &= ntb_db_read(nt->dev);
+ while (vec_mask != 0) {
+ qp_num = ffsll(vec_mask) - 1;
+
+ qp = &nt->qp_vec[qp_num];
+ if (qp->link_is_up)
+ taskqueue_enqueue(qp->rxc_tq, &qp->rxc_db_work);
+
+ vec_mask &= ~(1ull << qp_num);
+ }
+}
+
+/* Link Event handler */
+static void
+ntb_transport_event_callback(void *data)
+{
+ struct ntb_transport_ctx *nt = data;
+
+ if (ntb_link_is_up(nt->dev, NULL, NULL)) {
+ ntb_printf(1, "HW link up\n");
+ callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
+ } else {
+ ntb_printf(1, "HW link down\n");
+ taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup);
+ }
+}
+
+/* Link bring up */
+static void
+ntb_transport_link_work(void *arg)
+{
+ struct ntb_transport_ctx *nt = arg;
+ device_t dev = nt->dev;
+ struct ntb_transport_qp *qp;
+ uint64_t val64, size;
+ uint32_t val;
+ unsigned i;
+ int rc;
+
+	/* Send the local info, in the opposite order of the way we read it. */
+ for (i = 0; i < nt->mw_count; i++) {
+ size = nt->mw_vec[i].phys_size;
+
+ if (max_mw_size != 0 && size > max_mw_size)
+ size = max_mw_size;
+
+ ntb_peer_spad_write(dev, NTBT_MW0_SZ_HIGH + (i * 2),
+ size >> 32);
+ ntb_peer_spad_write(dev, NTBT_MW0_SZ_LOW + (i * 2), size);
+ }
+ ntb_peer_spad_write(dev, NTBT_NUM_MWS, nt->mw_count);
+ ntb_peer_spad_write(dev, NTBT_NUM_QPS, nt->qp_count);
+ ntb_peer_spad_write(dev, NTBT_QP_LINKS, 0);
+ ntb_peer_spad_write(dev, NTBT_VERSION, NTB_TRANSPORT_VERSION);
+
+ /* Query the remote side for its info */
+ val = 0;
+ ntb_spad_read(dev, NTBT_VERSION, &val);
+ if (val != NTB_TRANSPORT_VERSION)
+ goto out;
+
+ ntb_spad_read(dev, NTBT_NUM_QPS, &val);
+ if (val != nt->qp_count)
+ goto out;
+
+ ntb_spad_read(dev, NTBT_NUM_MWS, &val);
+ if (val != nt->mw_count)
+ goto out;
+
+ for (i = 0; i < nt->mw_count; i++) {
+ ntb_spad_read(dev, NTBT_MW0_SZ_HIGH + (i * 2), &val);
+ val64 = (uint64_t)val << 32;
+
+ ntb_spad_read(dev, NTBT_MW0_SZ_LOW + (i * 2), &val);
+ val64 |= val;
+
+ rc = ntb_set_mw(nt, i, val64);
+ if (rc != 0)
+ goto free_mws;
+ }
+
+ nt->link_is_up = true;
+ ntb_printf(1, "transport link up\n");
+
+ for (i = 0; i < nt->qp_count; i++) {
+ qp = &nt->qp_vec[i];
+
+ ntb_transport_setup_qp_mw(nt, i);
+
+ if (qp->client_ready)
+ callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+ }
+
+ return;
+
+free_mws:
+ for (i = 0; i < nt->mw_count; i++)
+ ntb_free_mw(nt, i);
+out:
+ if (ntb_link_is_up(dev, NULL, NULL))
+ callout_reset(&nt->link_work,
+ NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt);
+}
+
+static int
+ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size)
+{
+ struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+ size_t xlat_size, buff_size;
+ int rc;
+
+ if (size == 0)
+ return (EINVAL);
+
+ xlat_size = roundup(size, mw->xlat_align_size);
+ buff_size = xlat_size;
+
+ /* No need to re-setup */
+ if (mw->xlat_size == xlat_size)
+ return (0);
+
+ if (mw->buff_size != 0)
+ ntb_free_mw(nt, num_mw);
+
+	/* Allocate memory for receiving data; it must be aligned. */
+ mw->xlat_size = xlat_size;
+ mw->buff_size = buff_size;
+
+ mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_T, M_ZERO, 0,
+ mw->addr_limit, mw->xlat_align, 0);
+ if (mw->virt_addr == NULL) {
+ ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n",
+ mw->buff_size, mw->xlat_size);
+ mw->xlat_size = 0;
+ mw->buff_size = 0;
+ return (ENOMEM);
+ }
+ /* TODO: replace with bus_space_* functions */
+ mw->dma_addr = vtophys(mw->virt_addr);
+
+ /*
+ * Ensure that the allocation from contigmalloc is aligned as
+ * requested. XXX: This may not be needed -- brought in for parity
+ * with the Linux driver.
+ */
+ if (mw->dma_addr % mw->xlat_align != 0) {
+ ntb_printf(0,
+ "DMA memory 0x%jx not aligned to BAR size 0x%zx\n",
+ (uintmax_t)mw->dma_addr, size);
+ ntb_free_mw(nt, num_mw);
+ return (ENOMEM);
+ }
+
+	/* Notify HW of the memory location of the receive buffer. */
+ rc = ntb_mw_set_trans(nt->dev, num_mw, mw->dma_addr, mw->xlat_size);
+ if (rc) {
+ ntb_printf(0, "Unable to set mw%d translation\n", num_mw);
+ ntb_free_mw(nt, num_mw);
+ return (rc);
+ }
+
+ return (0);
+}
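For reference, the contigmalloc() call above maps onto its parameter
list as: size = buff_size, type = M_NTB_T, flags = M_ZERO, low = 0,
high = addr_limit, alignment = xlat_align, boundary = 0 (no
boundary-crossing restriction), i.e. physically contiguous, zeroed
memory below addr_limit, aligned to the memory window's translation
alignment.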
+
+static void
+ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
+{
+ struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+
+ if (mw->virt_addr == NULL)
+ return;
+
+ ntb_mw_clear_trans(nt->dev, num_mw);
+ contigfree(mw->virt_addr, mw->xlat_size, M_NTB_T);
+ mw->xlat_size = 0;
+ mw->buff_size = 0;
+ mw->virt_addr = NULL;
+}
+
+static int
+ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num)
+{
+ struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+ struct ntb_transport_mw *mw;
+ void *offset;
+ ntb_q_idx_t i;
+ size_t rx_size;
+ unsigned num_qps_mw, mw_num, mw_count;
+
+ mw_count = nt->mw_count;
+ mw_num = QP_TO_MW(nt, qp_num);
+ mw = &nt->mw_vec[mw_num];
+
+ if (mw->virt_addr == NULL)
+ return (ENOMEM);
+
+ if (mw_num < nt->qp_count % mw_count)
+ num_qps_mw = nt->qp_count / mw_count + 1;
+ else
+ num_qps_mw = nt->qp_count / mw_count;
+
+ rx_size = mw->xlat_size / num_qps_mw;
+ qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
+ rx_size -= sizeof(struct ntb_rx_info);
+
+ qp->remote_rx_info = (void*)(qp->rx_buff + rx_size);
+
+	/* Due to housekeeping, there must be at least two buffers. */
+ qp->rx_max_frame = qmin(transport_mtu, rx_size / 2);
+ qp->rx_max_entry = rx_size / qp->rx_max_frame;
+ qp->rx_index = 0;
+
+ qp->remote_rx_info->entry = qp->rx_max_entry - 1;
+
+ /* Set up the hdr offsets with 0s */
+ for (i = 0; i < qp->rx_max_entry; i++) {
+ offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) -
+ sizeof(struct ntb_payload_header));
+ memset(offset, 0, sizeof(struct ntb_payload_header));
+ }
+
+ qp->rx_pkts = 0;
+ qp->tx_pkts = 0;
+ qp->tx_index = 0;
+
+ return (0);
+}
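Worked example of the partitioning arithmetic above, with hypothetical
values qp_count = 5 and mw_count = 2: QP_TO_MW() spreads queues across
windows round-robin, so MW 0 serves qps {0, 2, 4} and MW 1 serves qps
{1, 3}.  For MW 0, mw_num (0) is less than qp_count % mw_count (1),
hence num_qps_mw = 5 / 2 + 1 = 3 and rx_size = xlat_size / 3; qp 4 then
takes the slice at index qp_num / mw_count = 2 within MW 0.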
+
+static void
+ntb_qp_link_work(void *arg)
+{
+ struct ntb_transport_qp *qp = arg;
+ device_t dev = qp->dev;
+ struct ntb_transport_ctx *nt = qp->transport;
+ int i;
+ uint32_t val;
+
+ /* Report queues that are up on our side */
+ for (i = 0, val = 0; i < nt->qp_count; i++) {
+ if (nt->qp_vec[i].client_ready)
+ val |= (1 << i);
+ }
+ ntb_peer_spad_write(dev, NTBT_QP_LINKS, val);
+
+ /* See if the remote side is up */
+ ntb_spad_read(dev, NTBT_QP_LINKS, &val);
+ if ((val & (1ull << qp->qp_num)) != 0) {
+ ntb_printf(2, "qp %d link up\n", qp->qp_num);
+ qp->link_is_up = true;
+
+ if (qp->event_handler != NULL)
+ qp->event_handler(qp->cb_data, NTB_LINK_UP);
+
+ ntb_db_clear_mask(dev, 1ull << qp->qp_num);
+ } else if (nt->link_is_up)
+ callout_reset(&qp->link_work,
+ NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
+}
+
+/* Link down event */
+static void
+ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
+{
+ struct ntb_transport_qp *qp;
+ int i;
+
+ /* Pass along the info to any clients */
+ for (i = 0; i < nt->qp_count; i++) {
+ if ((nt->qp_bitmap & (1 << i)) != 0) {
+ qp = &nt->qp_vec[i];
+ ntb_qp_link_cleanup(qp);
+ callout_drain(&qp->link_work);
+ }
+ }
+
+ if (!nt->link_is_up)
+ callout_drain(&nt->link_work);
+
+ /*
+ * The scratchpad registers keep the values if the remote side
+	 * goes down; blast them now to give them a sane value the next
+	 * time they are accessed.
+ */
+ ntb_spad_clear(nt->dev);
+}
+
+static void
+ntb_transport_link_cleanup_work(void *arg, int pending __unused)
+{
+
+ ntb_transport_link_cleanup(arg);
+}
+
+static void
+ntb_qp_link_down(struct ntb_transport_qp *qp)
+{
+
+ ntb_qp_link_cleanup(qp);
+}
+
+static void
+ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
+{
+
+ qp->link_is_up = false;
+ ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
+
+ qp->tx_index = qp->rx_index = 0;
+ qp->tx_bytes = qp->rx_bytes = 0;
+ qp->tx_pkts = qp->rx_pkts = 0;
+
+ qp->rx_ring_empty = 0;
+ qp->tx_ring_full = 0;
+
+ qp->rx_err_no_buf = qp->tx_err_no_buf = 0;
+ qp->rx_err_oflow = qp->rx_err_ver = 0;
+}
+
+static void
+ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
+{
+
+ callout_drain(&qp->link_work);
+ ntb_qp_link_down_reset(qp);
+
+ if (qp->event_handler != NULL)
+ qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
+}
+
+/* Link commanded down */
+/**
+ * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
+ * @qp: NTB transport layer queue to be disabled
+ *
+ * Notify the NTB transport layer of the client's desire to no longer
+ * receive data on the specified transport queue. It is the client's
+ * responsibility to ensure all entries on the queue are purged or
+ * otherwise handled appropriately.
+ */
+void
+ntb_transport_link_down(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+ int i;
+ uint32_t val;
+
+ qp->client_ready = false;
+ for (i = 0, val = 0; i < nt->qp_count; i++) {
+ if (nt->qp_vec[i].client_ready)
+ val |= (1 << i);
+ }
+ ntb_peer_spad_write(qp->dev, NTBT_QP_LINKS, val);
+
+ if (qp->link_is_up)
+ ntb_send_link_down(qp);
+ else
+ callout_drain(&qp->link_work);
+}
+
+/**
+ * ntb_transport_link_query - Query transport link state
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query connectivity to the remote system of the NTB transport queue
+ *
+ * RETURNS: true for link up or false for link down
+ */
+bool
+ntb_transport_link_query(struct ntb_transport_qp *qp)
+{
+
+ return (qp->link_is_up);
+}
+
+static void
+ntb_send_link_down(struct ntb_transport_qp *qp)
+{
+ struct ntb_queue_entry *entry;
+ int i, rc;
+
+ if (!qp->link_is_up)
+ return;
+
+ for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
+ entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
+ if (entry != NULL)
+ break;
+ pause("NTB Wait for link down", hz / 10);
+ }
+
+ if (entry == NULL)
+ return;
+
+ entry->cb_data = NULL;
+ entry->buf = NULL;
+ entry->len = 0;
+ entry->flags = NTBT_LINK_DOWN_FLAG;
+
+ mtx_lock(&qp->tx_lock);
+ rc = ntb_process_tx(qp, entry);
+ mtx_unlock(&qp->tx_lock);
+ if (rc != 0)
+ printf("ntb: Failed to send link down\n");
+
+ ntb_qp_link_down_reset(qp);
+}
+
+
+/* List Management */
+
+static void
+ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
+ struct ntb_queue_list *list)
+{
+
+ mtx_lock_spin(lock);
+ STAILQ_INSERT_TAIL(list, entry, entry);
+ mtx_unlock_spin(lock);
+}
+
+static struct ntb_queue_entry *
+ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list)
+{
+ struct ntb_queue_entry *entry;
+
+ mtx_lock_spin(lock);
+ if (STAILQ_EMPTY(list)) {
+ entry = NULL;
+ goto out;
+ }
+ entry = STAILQ_FIRST(list);
+ STAILQ_REMOVE_HEAD(list, entry);
+out:
+ mtx_unlock_spin(lock);
+
+ return (entry);
+}
+
+static struct ntb_queue_entry *
+ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from,
+ struct ntb_queue_list *to)
+{
+ struct ntb_queue_entry *entry;
+
+ mtx_lock_spin(lock);
+ if (STAILQ_EMPTY(from)) {
+ entry = NULL;
+ goto out;
+ }
+ entry = STAILQ_FIRST(from);
+ STAILQ_REMOVE_HEAD(from, entry);
+ STAILQ_INSERT_TAIL(to, entry, entry);
+
+out:
+ mtx_unlock_spin(lock);
+ return (entry);
+}
+
+/**
+ * ntb_transport_qp_num - Query the qp number
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query qp number of the NTB transport queue
+ *
+ * RETURNS: a zero-based number specifying the qp number
+ */
+unsigned char
+ntb_transport_qp_num(struct ntb_transport_qp *qp)
+{
+
+ return (qp->qp_num);
+}
+
+/**
+ * ntb_transport_max_size - Query the max payload size of a qp
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query the maximum payload size permissible on the given qp
+ *
+ * RETURNS: the max payload size of a qp
+ */
+unsigned int
+ntb_transport_max_size(struct ntb_transport_qp *qp)
+{
+
+ return (qp->tx_max_frame - sizeof(struct ntb_payload_header));
+}
+
+unsigned int
+ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
+{
+ unsigned int head = qp->tx_index;
+ unsigned int tail = qp->remote_rx_info->entry;
+
+ return (tail >= head ? tail - head : qp->tx_max_entry + tail - head);
+}
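Worked example of the free-entry arithmetic above, with hypothetical
values: tx_max_entry = 8, head (tx_index) = 6 and tail
(remote_rx_info->entry) = 2.  Since tail < head, the ring has wrapped
and the free count is 8 + 2 - 6 = 4 slots.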
+
+static device_method_t ntb_transport_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ntb_transport_probe),
+ DEVMETHOD(device_attach, ntb_transport_attach),
+ DEVMETHOD(device_detach, ntb_transport_detach),
+ DEVMETHOD_END
+};
+
+devclass_t ntb_transport_devclass;
+static DEFINE_CLASS_0(ntb_transport, ntb_transport_driver,
+ ntb_transport_methods, sizeof(struct ntb_transport_ctx));
+DRIVER_MODULE(ntb_transport, ntb_hw, ntb_transport_driver,
+ ntb_transport_devclass, NULL, NULL);
+MODULE_DEPEND(ntb_transport, ntb, 1, 1, 1);
+MODULE_VERSION(ntb_transport, 1);
diff --git a/sys/dev/ntb/ntb_transport.h b/sys/dev/ntb/ntb_transport.h
new file mode 100644
index 0000000..63cdbce
--- /dev/null
+++ b/sys/dev/ntb/ntb_transport.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+struct ntb_transport_qp;
+
+extern devclass_t ntb_transport_devclass;
+
+enum ntb_link_event {
+ NTB_LINK_DOWN = 0,
+ NTB_LINK_UP,
+};
+
+struct ntb_queue_handlers {
+ void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ void (*event_handler)(void *data, enum ntb_link_event status);
+};
+
+int ntb_transport_queue_count(device_t dev);
+struct ntb_transport_qp *
+ntb_transport_create_queue(device_t dev, int q,
+ const struct ntb_queue_handlers *handlers, void *data);
+void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len);
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len);
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
+void ntb_transport_link_up(struct ntb_transport_qp *qp);
+void ntb_transport_link_down(struct ntb_transport_qp *qp);
+bool ntb_transport_link_query(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
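A minimal consumer sketch for the API above; the handler and softc names
are hypothetical, but the signatures follow struct ntb_queue_handlers
and the calls are those declared in this header.

	static void
	my_rx(struct ntb_transport_qp *qp, void *qp_data, void *data, int len)
	{
		/* Take ownership of the received buffer here. */
	}

	static void
	my_tx(struct ntb_transport_qp *qp, void *qp_data, void *data, int len)
	{
		/* The transport is done with the transmitted buffer. */
	}

	static void
	my_event(void *data, enum ntb_link_event status)
	{
		/* React to NTB_LINK_UP / NTB_LINK_DOWN. */
	}

	static const struct ntb_queue_handlers my_handlers = {
		.rx_handler = my_rx,
		.tx_handler = my_tx,
		.event_handler = my_event,
	};

	/*
	 * In a client's attach routine (sc being a hypothetical softc):
	 *
	 *	qp = ntb_transport_create_queue(dev, 0, &my_handlers, sc);
	 *	ntb_transport_rx_enqueue(qp, sc, buf, len);
	 *	ntb_transport_link_up(qp);
	 */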
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 4ceb075..d70aaad 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -1744,7 +1744,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count,
for (i = 0; i < count; i++) {
if (vectors[i] == 0)
continue;
- irq = msix->msix_vectors[vectors[i]].mv_irq;
+ irq = msix->msix_vectors[vectors[i] - 1].mv_irq;
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
@@ -1758,7 +1758,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count,
printf("---");
else
printf("%d",
- msix->msix_vectors[vectors[i]].mv_irq);
+ msix->msix_vectors[vectors[i] - 1].mv_irq);
}
printf("\n");
}
diff --git a/sys/dev/sfxge/sfxge_ev.c b/sys/dev/sfxge/sfxge_ev.c
index d5aff5f..06ffed2 100644
--- a/sys/dev/sfxge/sfxge_ev.c
+++ b/sys/dev/sfxge/sfxge_ev.c
@@ -448,7 +448,7 @@ sfxge_ev_stat_update(struct sfxge_softc *sc)
goto out;
now = ticks;
- if (now - sc->ev_stats_update_time < hz)
+ if ((unsigned int)(now - sc->ev_stats_update_time) < (unsigned int)hz)
goto out;
sc->ev_stats_update_time = now;
diff --git a/sys/dev/sfxge/sfxge_port.c b/sys/dev/sfxge/sfxge_port.c
index 709ed78..a4f671f 100644
--- a/sys/dev/sfxge/sfxge_port.c
+++ b/sys/dev/sfxge/sfxge_port.c
@@ -62,7 +62,7 @@ sfxge_mac_stat_update(struct sfxge_softc *sc)
}
now = ticks;
- if (now - port->mac_stats.update_time < hz) {
+ if ((unsigned int)(now - port->mac_stats.update_time) < (unsigned int)hz) {
rc = 0;
goto out;
}
@@ -543,7 +543,7 @@ sfxge_phy_stat_update(struct sfxge_softc *sc)
}
now = ticks;
- if (now - port->phy_stats.update_time < hz) {
+ if ((unsigned int)(now - port->phy_stats.update_time) < (unsigned int)hz) {
rc = 0;
goto out;
}
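The casts added in the three hunks above guard the elapsed-tick
comparison against wraparound of the signed `ticks` counter.  A
standalone sketch of the effect (values are illustrative; assumes a
32-bit int):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int then = 0x7fffff00u;	/* just before wrap */
		unsigned int now  = 0x80000100u;	/* just after wrap */
		unsigned int hz   = 1000;

		/*
		 * Unsigned subtraction wraps modulo 2^32, so the delta is
		 * 512 here even though the values straddle INT_MAX.
		 */
		if (now - then < hz)
			printf("only %u ticks elapsed; skip the update\n",
			    now - then);
		return (0);
	}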
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 4d96840..8fa6bcd 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -728,6 +728,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
if (flags & RFPPWAIT) {
td->td_pflags |= TDP_RFPPWAIT;
td->td_rfppwait_p = p2;
+ td->td_dbgflags |= TDB_VFORK;
}
PROC_UNLOCK(p2);
if ((flags & RFSTOPPED) == 0) {
@@ -1063,7 +1064,7 @@ fork_return(struct thread *td, struct trapframe *frame)
* parent's children, do it now.
*/
dbg = p->p_pptr->p_pptr;
- proc_set_traced(p);
+ proc_set_traced(p, true);
CTR2(KTR_PTRACE,
"fork_return: attaching to new child pid %d: oppid %d",
p->p_pid, p->p_oppid);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 2c37d76..75121b5 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -2510,7 +2510,7 @@ ptracestop(struct thread *td, int sig)
* a chance to report itself upon the next iteration.
*/
if ((td->td_dbgflags & TDB_FSTP) != 0 ||
- ((p->p_flag & P2_PTRACE_FSTP) == 0 &&
+ ((p->p_flag2 & P2_PTRACE_FSTP) == 0 &&
p->p_xthread == NULL)) {
p->p_xstat = sig;
p->p_xthread = td;
diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c
index f2b83f0..201d876 100644
--- a/sys/kern/subr_syscall.c
+++ b/sys/kern/subr_syscall.c
@@ -249,5 +249,13 @@ again:
cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz);
}
PROC_UNLOCK(p2);
+
+ if (td->td_dbgflags & TDB_VFORK) {
+ PROC_LOCK(p);
+ if (p->p_ptevents & PTRACE_VFORK)
+ ptracestop(td, SIGTRAP);
+ td->td_dbgflags &= ~TDB_VFORK;
+ PROC_UNLOCK(p);
+ }
}
}
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index c4533ce..b2dbf72 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -649,12 +649,13 @@ sys_ptrace(struct thread *td, struct ptrace_args *uap)
#endif
void
-proc_set_traced(struct proc *p)
+proc_set_traced(struct proc *p, bool stop)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_TRACED;
- p->p_flag2 |= P2_PTRACE_FSTP;
+ if (stop)
+ p->p_flag2 |= P2_PTRACE_FSTP;
p->p_ptevents = PTRACE_DEFAULT;
p->p_oppid = p->p_pptr->p_pid;
}
@@ -867,7 +868,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
switch (req) {
case PT_TRACE_ME:
/* set my trace flag and "owner" so it can read/write me */
- proc_set_traced(p);
+ proc_set_traced(p, false);
if (p->p_flag & P_PPWAIT)
p->p_flag |= P_PPTRACE;
CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid);
@@ -884,7 +885,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
* The old parent is remembered so we can put things back
* on a "detach".
*/
- proc_set_traced(p);
+ proc_set_traced(p, true);
if (p->p_pptr != td->td_proc) {
proc_reparent(p, td->td_proc);
}
@@ -957,7 +958,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
}
tmp = *(int *)addr;
if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX |
- PTRACE_FORK | PTRACE_LWP)) != 0) {
+ PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) {
error = EINVAL;
break;
}
@@ -1296,7 +1297,11 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
if (td2->td_dbgflags & TDB_FORK) {
pl->pl_flags |= PL_FLAG_FORKED;
pl->pl_child_pid = td2->td_dbg_forked;
- }
+ if (td2->td_dbgflags & TDB_VFORK)
+ pl->pl_flags |= PL_FLAG_VFORKED;
+ } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) ==
+ TDB_VFORK)
+ pl->pl_flags |= PL_FLAG_VFORK_DONE;
if (td2->td_dbgflags & TDB_CHILD)
pl->pl_flags |= PL_FLAG_CHILD;
if (td2->td_dbgflags & TDB_BORN)
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 89b7a00..0fa87f9 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -1582,7 +1582,7 @@ static struct aiocb_ops aiocb_ops_osigevent = {
*/
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
- int type, struct aiocb_ops *ops)
+ int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
cap_rights_t rights;
@@ -2568,14 +2568,9 @@ static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
{
- struct proc *p = td->td_proc;
- struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
- ki = p->p_aioinfo;
- if (ki == NULL)
- aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 94b8149..166ed65 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -635,7 +635,6 @@ int
vop_stdfsync(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
- struct ucred *a_cred;
int a_waitfor;
struct thread *a_td;
} */ *ap;
diff --git a/sys/modules/ntb/Makefile b/sys/modules/ntb/Makefile
index a5169a0..3eaf751 100644
--- a/sys/modules/ntb/Makefile
+++ b/sys/modules/ntb/Makefile
@@ -1,5 +1,5 @@
# $FreeBSD$
-SUBDIR= ntb_hw if_ntb
+SUBDIR= ntb ntb_hw ntb_transport if_ntb
.include <bsd.subdir.mk>
diff --git a/sys/modules/ntb/ntb/Makefile b/sys/modules/ntb/ntb/Makefile
new file mode 100644
index 0000000..a343f28
--- /dev/null
+++ b/sys/modules/ntb/ntb/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../dev/ntb
+
+KMOD = ntb
+SRCS = ntb.c ntb_if.c
+SRCS += device_if.h bus_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/ntb/ntb_hw/Makefile b/sys/modules/ntb/ntb_hw/Makefile
index fc46b46..5240411 100644
--- a/sys/modules/ntb/ntb_hw/Makefile
+++ b/sys/modules/ntb/ntb_hw/Makefile
@@ -4,6 +4,8 @@
KMOD = ntb_hw
SRCS = ntb_hw.c
-SRCS += device_if.h bus_if.h pci_if.h
+SRCS += device_if.h bus_if.h pci_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/pci/pci_if.m dev/ntb/ntb_if.m
.include <bsd.kmod.mk>
diff --git a/sys/modules/ntb/ntb_transport/Makefile b/sys/modules/ntb/ntb_transport/Makefile
new file mode 100644
index 0000000..5055600
--- /dev/null
+++ b/sys/modules/ntb/ntb_transport/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../dev/ntb
+
+KMOD = ntb_transport
+SRCS = ntb_transport.c
+SRCS += device_if.h bus_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 14d9967..57aadc0 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -165,7 +165,8 @@ __FBSDID("$FreeBSD$");
/*
* List of capabilities to possibly mask on the member interface.
*/
-#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM)
+#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\
+ IFCAP_TXCSUM_IPV6)
/*
* List of capabilities to strip
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index a1ceaab..8c56a13 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -2758,8 +2758,8 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
switch (af) {
#ifdef INET
case AF_INET:
- if ((a->addr32[0] < b->addr32[0]) ||
- (a->addr32[0] > e->addr32[0]))
+ if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
+ (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
return (0);
break;
#endif /* INET */
@@ -2769,15 +2769,15 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
/* check a >= b */
for (i = 0; i < 4; ++i)
- if (a->addr32[i] > b->addr32[i])
+ if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
break;
- else if (a->addr32[i] < b->addr32[i])
+ else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
return (0);
/* check a <= e */
for (i = 0; i < 4; ++i)
- if (a->addr32[i] < e->addr32[i])
+ if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
break;
- else if (a->addr32[i] > e->addr32[i])
+ else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
return (0);
break;
}
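A quick demonstration of the ordering bug the ntohl() conversions above
fix: on a little-endian host, raw addr32 words compare in byte-swapped
order, so a numerically low address can appear greater than a high one.

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	int
	main(void)
	{
		uint32_t a, b;

		inet_pton(AF_INET, "10.0.0.2", &a);	/* network order */
		inet_pton(AF_INET, "192.168.0.1", &b);

		/* Little-endian raw compare: a >= b (wrong order). */
		printf("raw:   a %s b\n", a < b ? "<" : ">=");
		/* Host-order compare: a < b (correct). */
		printf("ntohl: a %s b\n", ntohl(a) < ntohl(b) ? "<" : ">=");
		return (0);
	}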
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 4b6c601..07f69c6 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1003505 /* Master, propagated to newvers */
+#define __FreeBSD_version 1003506 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 1b8bda5..59c75c5 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -398,6 +398,7 @@ do { \
#define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */
#define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */
#define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */
+#define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */
#define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */
/*
@@ -563,7 +564,7 @@ struct proc {
u_int p_magic; /* (b) Magic number. */
int p_osrel; /* (x) osreldate for the
binary (from ELF note, if any) */
- char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */
+ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */
void *p_pad0;
struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */
struct pargs *p_args; /* (c) Process arguments. */
@@ -932,7 +933,7 @@ void proc_linkup(struct proc *p, struct thread *td);
struct proc *proc_realparent(struct proc *child);
void proc_reap(struct thread *td, struct proc *p, int *status, int options);
void proc_reparent(struct proc *child, struct proc *newparent);
-void proc_set_traced(struct proc *p);
+void proc_set_traced(struct proc *p, bool stop);
struct pstats *pstats_alloc(void);
void pstats_fork(struct pstats *src, struct pstats *dst);
void pstats_free(struct pstats *ps);
diff --git a/sys/sys/ptrace.h b/sys/sys/ptrace.h
index e2b6a5f..f5f1db2 100644
--- a/sys/sys/ptrace.h
+++ b/sys/sys/ptrace.h
@@ -89,6 +89,7 @@
#define PTRACE_SYSCALL (PTRACE_SCE | PTRACE_SCX)
#define PTRACE_FORK 0x0008
#define PTRACE_LWP 0x0010
+#define PTRACE_VFORK 0x0020
#define PTRACE_DEFAULT (PTRACE_EXEC)
@@ -124,6 +125,8 @@ struct ptrace_lwpinfo {
#define PL_FLAG_CHILD 0x80 /* I am from child */
#define PL_FLAG_BORN 0x100 /* new LWP */
#define PL_FLAG_EXITED 0x200 /* exiting LWP */
+#define PL_FLAG_VFORKED 0x400 /* new child via vfork */
+#define PL_FLAG_VFORK_DONE 0x800 /* vfork parent has resumed */
sigset_t pl_sigmask; /* LWP signal mask */
sigset_t pl_siglist; /* LWP pending signal */
struct __siginfo pl_siginfo; /* siginfo for signal */
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 8551085..04e0ae9 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -255,6 +255,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -309,7 +311,7 @@ retry:
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -325,6 +327,8 @@ retry:
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
@@ -386,7 +390,7 @@ retry:
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -401,6 +405,8 @@ retry:
goto fail;
}
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -478,10 +484,16 @@ fail:
* We shall not leave the freed blocks on the vnode
* buffer object lists.
*/
- bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
if (bp != NULL) {
- bp->b_flags |= (B_INVAL | B_RELBUF);
- bp->b_flags &= ~B_ASYNC;
+ KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
+ ("mismatch1 l %jd %jd b %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
+ (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp)));
+ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
+ bp->b_flags &= ~(B_ASYNC | B_CACHE);
brelse(bp);
}
deallocated += fs->fs_bsize;
@@ -524,6 +536,18 @@ fail:
* cleared, free the blocks.
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
+#ifdef INVARIANTS
+ if (blkp == allociblk)
+ lbns_remfree = lbns;
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
+ if (bp != NULL) {
+ panic("zombie1 %jd %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp));
+ }
+ lbns_remfree++;
+#endif
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
}
@@ -818,6 +842,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
@@ -873,7 +899,7 @@ retry:
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -889,6 +915,8 @@ retry:
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
@@ -951,7 +979,7 @@ retry:
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -966,6 +994,8 @@ retry:
goto fail;
}
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -1049,10 +1079,16 @@ fail:
* We shall not leave the freed blocks on the vnode
* buffer object lists.
*/
- bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
if (bp != NULL) {
- bp->b_flags |= (B_INVAL | B_RELBUF);
- bp->b_flags &= ~B_ASYNC;
+ KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
+ ("mismatch2 l %jd %jd b %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
+ (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp)));
+ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
+ bp->b_flags &= ~(B_ASYNC | B_CACHE);
brelse(bp);
}
deallocated += fs->fs_bsize;
@@ -1095,6 +1131,18 @@ fail:
* cleared, free the blocks.
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
+#ifdef INVARIANTS
+ if (blkp == allociblk)
+ lbns_remfree = lbns;
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
+ if (bp != NULL) {
+ panic("zombie2 %jd %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp));
+ }
+ lbns_remfree++;
+#endif
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
}
diff --git a/tests/sys/kern/ptrace_test.c b/tests/sys/kern/ptrace_test.c
index 44ab15b..e6c5208 100644
--- a/tests/sys/kern/ptrace_test.c
+++ b/tests/sys/kern/ptrace_test.c
@@ -1463,6 +1463,130 @@ ATF_TC_BODY(ptrace__event_mask, tc)
ATF_REQUIRE(errno == ECHILD);
}
+/*
+ * Verify that the expected ptrace events are reported for PTRACE_VFORK.
+ */
+ATF_TC_WITHOUT_HEAD(ptrace__ptrace_vfork);
+ATF_TC_BODY(ptrace__ptrace_vfork, tc)
+{
+ struct ptrace_lwpinfo pl;
+ pid_t fpid, wpid;
+ int events, status;
+
+ ATF_REQUIRE((fpid = fork()) != -1);
+ if (fpid == 0) {
+ trace_me();
+ follow_fork_parent(true);
+ }
+
+ /* The first wait() should report the stop from SIGSTOP. */
+ wpid = waitpid(fpid, &status, 0);
+ ATF_REQUIRE(wpid == fpid);
+ ATF_REQUIRE(WIFSTOPPED(status));
+ ATF_REQUIRE(WSTOPSIG(status) == SIGSTOP);
+
+ ATF_REQUIRE(ptrace(PT_GET_EVENT_MASK, fpid, (caddr_t)&events,
+ sizeof(events)) == 0);
+ events |= PTRACE_VFORK;
+ ATF_REQUIRE(ptrace(PT_SET_EVENT_MASK, fpid, (caddr_t)&events,
+ sizeof(events)) == 0);
+
+ /* Continue the child ignoring the SIGSTOP. */
+ ATF_REQUIRE(ptrace(PT_CONTINUE, fpid, (caddr_t)1, 0) != -1);
+
+ /* The next event should report the end of the vfork. */
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == fpid);
+ ATF_REQUIRE(WIFSTOPPED(status));
+ ATF_REQUIRE(WSTOPSIG(status) == SIGTRAP);
+ ATF_REQUIRE(ptrace(PT_LWPINFO, wpid, (caddr_t)&pl, sizeof(pl)) != -1);
+ ATF_REQUIRE((pl.pl_flags & PL_FLAG_VFORK_DONE) != 0);
+
+ ATF_REQUIRE(ptrace(PT_CONTINUE, fpid, (caddr_t)1, 0) != -1);
+
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == fpid);
+ ATF_REQUIRE(WIFEXITED(status));
+ ATF_REQUIRE(WEXITSTATUS(status) == 1);
+
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == -1);
+ ATF_REQUIRE(errno == ECHILD);
+}
+
+ATF_TC_WITHOUT_HEAD(ptrace__ptrace_vfork_follow);
+ATF_TC_BODY(ptrace__ptrace_vfork_follow, tc)
+{
+ struct ptrace_lwpinfo pl[2];
+ pid_t children[2], fpid, wpid;
+ int events, status;
+
+ ATF_REQUIRE((fpid = fork()) != -1);
+ if (fpid == 0) {
+ trace_me();
+ follow_fork_parent(true);
+ }
+
+ /* Parent process. */
+ children[0] = fpid;
+
+ /* The first wait() should report the stop from SIGSTOP. */
+ wpid = waitpid(children[0], &status, 0);
+ ATF_REQUIRE(wpid == children[0]);
+ ATF_REQUIRE(WIFSTOPPED(status));
+ ATF_REQUIRE(WSTOPSIG(status) == SIGSTOP);
+
+ ATF_REQUIRE(ptrace(PT_GET_EVENT_MASK, children[0], (caddr_t)&events,
+ sizeof(events)) == 0);
+ events |= PTRACE_FORK | PTRACE_VFORK;
+ ATF_REQUIRE(ptrace(PT_SET_EVENT_MASK, children[0], (caddr_t)&events,
+ sizeof(events)) == 0);
+
+ /* Continue the child ignoring the SIGSTOP. */
+ ATF_REQUIRE(ptrace(PT_CONTINUE, children[0], (caddr_t)1, 0) != -1);
+
+ /* Wait for both halves of the fork event to get reported. */
+ children[1] = handle_fork_events(children[0], pl);
+ ATF_REQUIRE(children[1] > 0);
+
+ ATF_REQUIRE((pl[0].pl_flags & PL_FLAG_VFORKED) != 0);
+
+ ATF_REQUIRE(ptrace(PT_CONTINUE, children[0], (caddr_t)1, 0) != -1);
+ ATF_REQUIRE(ptrace(PT_CONTINUE, children[1], (caddr_t)1, 0) != -1);
+
+ /*
+ * The child can't exit until the grandchild reports status, so the
+ * grandchild should report its exit first to the debugger.
+ */
+ wpid = waitpid(children[1], &status, 0);
+ ATF_REQUIRE(wpid == children[1]);
+ ATF_REQUIRE(WIFEXITED(status));
+ ATF_REQUIRE(WEXITSTATUS(status) == 2);
+
+ /*
+	 * The child should report its vfork() completion before it
+ * exits.
+ */
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == children[0]);
+ ATF_REQUIRE(WIFSTOPPED(status));
+ ATF_REQUIRE(WSTOPSIG(status) == SIGTRAP);
+ ATF_REQUIRE(ptrace(PT_LWPINFO, wpid, (caddr_t)&pl[0], sizeof(pl[0])) !=
+ -1);
+ ATF_REQUIRE((pl[0].pl_flags & PL_FLAG_VFORK_DONE) != 0);
+
+ ATF_REQUIRE(ptrace(PT_CONTINUE, children[0], (caddr_t)1, 0) != -1);
+
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == children[0]);
+ ATF_REQUIRE(WIFEXITED(status));
+ ATF_REQUIRE(WEXITSTATUS(status) == 1);
+
+ wpid = wait(&status);
+ ATF_REQUIRE(wpid == -1);
+ ATF_REQUIRE(errno == ECHILD);
+}
+
ATF_TP_ADD_TCS(tp)
{
@@ -1487,6 +1611,8 @@ ATF_TP_ADD_TCS(tp)
ATF_TP_ADD_TC(tp, ptrace__ptrace_exec_disable);
ATF_TP_ADD_TC(tp, ptrace__ptrace_exec_enable);
ATF_TP_ADD_TC(tp, ptrace__event_mask);
+ ATF_TP_ADD_TC(tp, ptrace__ptrace_vfork);
+ ATF_TP_ADD_TC(tp, ptrace__ptrace_vfork_follow);
return (atf_no_error());
}
diff --git a/usr.bin/xinstall/xinstall.c b/usr.bin/xinstall/xinstall.c
index ae2ab79..c4397a6 100644
--- a/usr.bin/xinstall/xinstall.c
+++ b/usr.bin/xinstall/xinstall.c
@@ -151,6 +151,7 @@ main(int argc, char *argv[])
char *p;
const char *to_name;
+ fset = 0;
iflags = 0;
group = owner = NULL;
while ((ch = getopt(argc, argv, "B:bCcD:df:g:h:l:M:m:N:o:pSsT:Uv")) !=
@@ -535,7 +536,9 @@ do_link(const char *from_name, const char *to_name,
if (target_sb->st_flags & NOCHANGEBITS)
(void)chflags(to_name, target_sb->st_flags &
~NOCHANGEBITS);
- unlink(to_name);
+ if (verbose)
+ printf("install: link %s -> %s\n",
+ from_name, to_name);
ret = rename(tmpl, to_name);
/*
* If rename has posix semantics, then the temporary
@@ -545,8 +548,12 @@ do_link(const char *from_name, const char *to_name,
(void)unlink(tmpl);
}
return (ret);
- } else
+ } else {
+ if (verbose)
+ printf("install: link %s -> %s\n",
+ from_name, to_name);
return (link(from_name, to_name));
+ }
}
/*
@@ -575,14 +582,18 @@ do_symlink(const char *from_name, const char *to_name,
if (target_sb->st_flags & NOCHANGEBITS)
(void)chflags(to_name, target_sb->st_flags &
~NOCHANGEBITS);
- unlink(to_name);
-
+ if (verbose)
+ printf("install: symlink %s -> %s\n",
+ from_name, to_name);
if (rename(tmpl, to_name) == -1) {
/* Remove temporary link before exiting. */
(void)unlink(tmpl);
err(EX_OSERR, "%s: rename", to_name);
}
} else {
+ if (verbose)
+ printf("install: symlink %s -> %s\n",
+ from_name, to_name);
if (symlink(from_name, to_name) == -1)
err(EX_OSERR, "symlink %s -> %s", from_name, to_name);
}
@@ -882,11 +893,21 @@ install(const char *from_name, const char *to_name, u_long fset, u_int flags)
}
if (verbose)
(void)printf("install: %s -> %s\n", to_name, backup);
- if (rename(to_name, backup) < 0) {
+ if (unlink(backup) < 0 && errno != ENOENT) {
+ serrno = errno;
+ if (to_sb.st_flags & NOCHANGEBITS)
+ (void)chflags(to_name, to_sb.st_flags);
+ unlink(tempfile);
+ errno = serrno;
+ err(EX_OSERR, "unlink: %s", backup);
+ }
+ if (link(to_name, backup) < 0) {
serrno = errno;
unlink(tempfile);
+ if (to_sb.st_flags & NOCHANGEBITS)
+ (void)chflags(to_name, to_sb.st_flags);
errno = serrno;
- err(EX_OSERR, "rename: %s to %s", to_name,
+ err(EX_OSERR, "link: %s to %s", to_name,
backup);
}
}
@@ -1109,16 +1130,26 @@ create_newfile(const char *path, int target, struct stat *sbp)
if (dobackup) {
if ((size_t)snprintf(backup, MAXPATHLEN, "%s%s",
- path, suffix) != strlen(path) + strlen(suffix))
+ path, suffix) != strlen(path) + strlen(suffix)) {
+ saved_errno = errno;
+ if (sbp->st_flags & NOCHANGEBITS)
+ (void)chflags(path, sbp->st_flags);
+ errno = saved_errno;
errx(EX_OSERR, "%s: backup filename too long",
path);
+ }
(void)snprintf(backup, MAXPATHLEN, "%s%s",
path, suffix);
if (verbose)
(void)printf("install: %s -> %s\n",
path, backup);
- if (rename(path, backup) < 0)
+ if (rename(path, backup) < 0) {
+ saved_errno = errno;
+ if (sbp->st_flags & NOCHANGEBITS)
+ (void)chflags(path, sbp->st_flags);
+ errno = saved_errno;
err(EX_OSERR, "rename: %s to %s", path, backup);
+ }
} else
if (unlink(path) < 0)
saved_errno = errno;
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 33635f5..7eb692c 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -8,6 +8,8 @@ DEBUG_FLAGS= -g -O0
MAN= bhyve.8
+BHYVE_SYSDIR?=${SRCTOP}
+
SRCS= \
atkbdc.c \
acpi.c \
@@ -23,6 +25,7 @@ SRCS= \
mevent.c \
mptbl.c \
pci_ahci.c \
+ pci_e82545.c \
pci_emul.c \
pci_hostbridge.c \
pci_irq.c \
@@ -42,12 +45,16 @@ SRCS= \
xmsr.c \
spinup_ap.c
-.PATH: ${.CURDIR}/../../sys/amd64/vmm
+.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBUTIL} ${LIBPTHREAD}
LDADD= -lvmmapi -lmd -lutil -lpthread
+CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000
+CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii
+CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller
+
WARNS?= 2
.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 6509fe7..20dc04f 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd August 7, 2015
+.Dd July 9, 2016
.Dt BHYVE 8
.Os
.Sh NAME
@@ -168,10 +168,14 @@ Virtio network interface.
Virtio block storage interface.
.It Li virtio-rnd
Virtio RNG interface.
+.It Li ahci
+AHCI controller attached to arbitrary devices.
.It Li ahci-cd
AHCI controller attached to an ATAPI CD/DVD.
.It Li ahci-hd
AHCI controller attached to a SATA hard-drive.
+.It Li e1000
+Intel e82545 network interface.
.It Li uart
PCI 16550 serial device.
.It Li lpc
@@ -323,15 +327,11 @@ null-modem device.
.Bd -literal -offset indent
bhyve -c 4 \\
-s 0,amd_hostbridge -s 1,lpc \\
- -s 1:0,ahci-hd,/images/disk.1 \\
- -s 1:1,ahci-hd,/images/disk.2 \\
- -s 1:2,ahci-hd,/images/disk.3 \\
- -s 1:3,ahci-hd,/images/disk.4 \\
- -s 1:4,ahci-hd,/images/disk.5 \\
- -s 1:5,ahci-hd,/images/disk.6 \\
- -s 1:6,ahci-hd,/images/disk.7 \\
- -s 1:7,ahci-hd,/images/disk.8 \\
- -s 2,ahci-cd,/images/install.iso \\
+ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\
+hd:/images/disk.3,hd:/images/disk.4,\\
+hd:/images/disk.5,hd:/images/disk.6,\\
+hd:/images/disk.7,hd:/images/disk.8,\\
+cd:/images/install.iso \\
-s 3,virtio-net,tap0 \\
-l com1,/dev/nmdm0A \\
-A -H -P -m 8G
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index cec2cd7..1cc9594 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org>
+ * Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -57,7 +58,8 @@ __FBSDID("$FreeBSD$");
#include "ahci.h"
#include "block_if.h"
-#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */
+#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */
+#define MAX_PORTS 32 /* AHCI supports 32 ports */
#define PxSIG_ATA 0x00000101 /* ATA drive */
#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */
@@ -133,6 +135,7 @@ struct ahci_port {
uint8_t *cmd_lst;
uint8_t *rfis;
char ident[20 + 1];
+ int port;
int atapi;
int reset;
int waitforclear;
@@ -217,47 +220,95 @@ static inline void lba_to_msf(uint8_t *buf, int lba)
}
/*
- * generate HBA intr depending on whether or not ports within
- * the controller have an interrupt pending.
+ * Generate HBA interrupts on global IS register write.
*/
static void
-ahci_generate_intr(struct pci_ahci_softc *sc)
+ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask)
{
- struct pci_devinst *pi;
- int i;
-
- pi = sc->asc_pi;
+ struct pci_devinst *pi = sc->asc_pi;
+ struct ahci_port *p;
+ int i, nmsg;
+ uint32_t mmask;
+ /* Update global IS from PxIS/PxIE. */
for (i = 0; i < sc->ports; i++) {
- struct ahci_port *pr;
- pr = &sc->port[i];
- if (pr->is & pr->ie)
+ p = &sc->port[i];
+ if (p->is & p->ie)
sc->is |= (1 << i);
}
+ DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is);
- DPRINTF("%s %x\n", __func__, sc->is);
-
- if (sc->is && (sc->ghc & AHCI_GHC_IE)) {
- if (pci_msi_enabled(pi)) {
- /*
- * Generate an MSI interrupt on every edge
- */
- pci_generate_msi(pi, 0);
- } else if (!sc->lintr) {
- /*
- * Only generate a pin-based interrupt if one wasn't
- * in progress
- */
+ /* If there is nothing enabled -- clear legacy interrupt and exit. */
+ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) {
+ if (sc->lintr) {
+ pci_lintr_deassert(pi);
+ sc->lintr = 0;
+ }
+ return;
+ }
+
+ /* If there is anything and no MSI -- assert legacy interrupt. */
+ nmsg = pci_msi_maxmsgnum(pi);
+ if (nmsg == 0) {
+ if (!sc->lintr) {
sc->lintr = 1;
pci_lintr_assert(pi);
}
- } else if (sc->lintr) {
- /*
- * No interrupts: deassert pin-based signal if it had
- * been asserted
- */
- pci_lintr_deassert(pi);
- sc->lintr = 0;
+ return;
+ }
+
+ /* Assert respective MSIs for ports that were touched. */
+ for (i = 0; i < nmsg; i++) {
+ if (sc->ports <= nmsg || i < nmsg - 1)
+ mmask = 1 << i;
+ else
+ mmask = 0xffffffff << i;
+ if (sc->is & mask && mmask & mask)
+ pci_generate_msi(pi, i);
+ }
+}
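Worked example of the MSI fan-out above, with hypothetical values
sc->ports = 8 and nmsg = 4: messages i = 0..2 each get mmask = 1 << i
(one port per message), while the last message (i = 3) gets
mmask = 0xffffffff << 3 and aggregates events from all higher ports,
i.e. the remaining ports 3..7 here.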
+
+/*
+ * Generate HBA interrupt on specific port event.
+ */
+static void
+ahci_port_intr(struct ahci_port *p)
+{
+ struct pci_ahci_softc *sc = p->pr_sc;
+ struct pci_devinst *pi = sc->asc_pi;
+ int nmsg;
+
+ DPRINTF("%s(%d) %08x/%08x %08x\n", __func__,
+ p->port, p->is, p->ie, sc->is);
+
+ /* If there is nothing enabled -- we are done. */
+ if ((p->is & p->ie) == 0)
+ return;
+
+ /* In case of non-shared MSI always generate interrupt. */
+ nmsg = pci_msi_maxmsgnum(pi);
+ if (sc->ports <= nmsg || p->port < nmsg - 1) {
+ sc->is |= (1 << p->port);
+ if ((sc->ghc & AHCI_GHC_IE) == 0)
+ return;
+ pci_generate_msi(pi, p->port);
+ return;
+ }
+
+ /* If IS for this port is already set -- do nothing. */
+ if (sc->is & (1 << p->port))
+ return;
+
+ sc->is |= (1 << p->port);
+
+ /* If interrupts are enabled -- generate one. */
+ if ((sc->ghc & AHCI_GHC_IE) == 0)
+ return;
+ if (nmsg > 0) {
+ pci_generate_msi(pi, nmsg - 1);
+ } else if (!sc->lintr) {
+ sc->lintr = 1;
+ pci_lintr_assert(pi);
}
}
@@ -295,8 +346,10 @@ ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
}
memcpy(p->rfis + offset, fis, len);
if (irq) {
- p->is |= irq;
- ahci_generate_intr(p->pr_sc);
+ if (~p->is & irq) {
+ p->is |= irq;
+ ahci_port_intr(p);
+ }
}
}
@@ -1738,7 +1791,7 @@ ahci_handle_slot(struct ahci_port *p, int slot)
struct pci_ahci_softc *sc;
uint8_t *cfis;
#ifdef AHCI_DEBUG
- int cfl;
+ int cfl, i;
#endif
sc = p->pr_sc;
@@ -1999,10 +2052,11 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
break;
case AHCI_P_IS:
p->is &= ~value;
+ ahci_port_intr(p);
break;
case AHCI_P_IE:
p->ie = value & 0xFDC000FF;
- ahci_generate_intr(sc);
+ ahci_port_intr(p);
break;
case AHCI_P_CMD:
{
@@ -2092,16 +2146,19 @@ pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset);
break;
case AHCI_GHC:
- if (value & AHCI_GHC_HR)
+ if (value & AHCI_GHC_HR) {
ahci_reset(sc);
- else if (value & AHCI_GHC_IE) {
- sc->ghc |= AHCI_GHC_IE;
- ahci_generate_intr(sc);
+ break;
}
+ if (value & AHCI_GHC_IE)
+ sc->ghc |= AHCI_GHC_IE;
+ else
+ sc->ghc &= ~AHCI_GHC_IE;
+ ahci_generate_intr(sc, 0xffffffff);
break;
case AHCI_IS:
sc->is &= ~value;
- ahci_generate_intr(sc);
+ ahci_generate_intr(sc, value);
break;
default:
break;
@@ -2236,20 +2293,16 @@ pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
static int
pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
{
- char bident[sizeof("XX:X:X")];
+ char bident[sizeof("XX:XX:XX")];
struct blockif_ctxt *bctxt;
struct pci_ahci_softc *sc;
- int ret, slots;
+ int ret, slots, p;
MD5_CTX mdctx;
u_char digest[16];
+ char *next, *next2;
ret = 0;
- if (opts == NULL) {
- fprintf(stderr, "pci_ahci: backing device required\n");
- return (1);
- }
-
#ifdef AHCI_DEBUG
dbg = fopen("/tmp/log", "w+");
#endif
@@ -2257,58 +2310,84 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
sc = calloc(1, sizeof(struct pci_ahci_softc));
pi->pi_arg = sc;
sc->asc_pi = pi;
- sc->ports = MAX_PORTS;
+ pthread_mutex_init(&sc->mtx, NULL);
+ sc->ports = 0;
+ sc->pi = 0;
+ slots = 32;
+
+ for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) {
+ /* Identify and cut off type of present port. */
+ if (strncmp(opts, "hd:", 3) == 0) {
+ atapi = 0;
+ opts += 3;
+ } else if (strncmp(opts, "cd:", 3) == 0) {
+ atapi = 1;
+ opts += 3;
+ }
- /*
- * Only use port 0 for a backing device. All other ports will be
- * marked as unused
- */
- sc->port[0].atapi = atapi;
+ /* Find and cut off the next port options. */
+ next = strstr(opts, ",hd:");
+ next2 = strstr(opts, ",cd:");
+ if (next == NULL || (next2 != NULL && next2 < next))
+ next = next2;
+ if (next != NULL) {
+ next[0] = 0;
+ next++;
+ }
- /*
- * Attempt to open the backing image. Use the PCI
- * slot/func for the identifier string.
- */
- snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
- bctxt = blockif_open(opts, bident);
- if (bctxt == NULL) {
- ret = 1;
- goto open_fail;
- }
- sc->port[0].bctx = bctxt;
- sc->port[0].pr_sc = sc;
+ if (opts[0] == 0)
+ continue;
- /*
- * Create an identifier for the backing file. Use parts of the
- * md5 sum of the filename
- */
- MD5Init(&mdctx);
- MD5Update(&mdctx, opts, strlen(opts));
- MD5Final(digest, &mdctx);
- sprintf(sc->port[0].ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
- digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
+ /*
+ * Attempt to open the backing image. Use the PCI slot/func
+ * and the port number for the identifier string.
+ */
+ snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot,
+ pi->pi_func, p);
+ bctxt = blockif_open(opts, bident);
+ if (bctxt == NULL) {
+ sc->ports = p;
+ ret = 1;
+ goto open_fail;
+ }
+ sc->port[p].bctx = bctxt;
+ sc->port[p].pr_sc = sc;
+ sc->port[p].port = p;
+ sc->port[p].atapi = atapi;
- /*
- * Allocate blockif request structures and add them
- * to the free list
- */
- pci_ahci_ioreq_init(&sc->port[0]);
+ /*
+ * Create an identifier for the backing file.
+ * Use parts of the md5 sum of the filename
+ */
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, opts, strlen(opts));
+ MD5Final(digest, &mdctx);
+ sprintf(sc->port[p].ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4],
+ digest[5]);
- pthread_mutex_init(&sc->mtx, NULL);
+ /*
+ * Allocate blockif request structures and add them
+ * to the free list
+ */
+ pci_ahci_ioreq_init(&sc->port[p]);
+
+ sc->pi |= (1 << p);
+ if (sc->port[p].ioqsz < slots)
+ slots = sc->port[p].ioqsz;
+ }
+ sc->ports = p;
/* Intel ICH8 AHCI */
- slots = sc->port[0].ioqsz;
- if (slots > 32)
- slots = 32;
--slots;
+ if (sc->ports < DEF_PORTS)
+ sc->ports = DEF_PORTS;
sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)|
AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
(slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);
- /* Only port 0 implemented */
- sc->pi = 1;
sc->vs = 0x10300;
sc->cap2 = AHCI_CAP2_APST;
ahci_reset(sc);
@@ -2318,7 +2397,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
- pci_emul_add_msicap(pi, 1);
+ p = MIN(sc->ports, 16);
+ p = flsl(p) - ((p & (p - 1)) ? 0 : 1);
+ pci_emul_add_msicap(pi, 1 << p);
pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
AHCI_OFFSET + sc->ports * AHCI_STEP);
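Worked example of the MSI sizing expression above, which rounds the
port count up to a power of two as MSI message counts require: for
sc->ports = 6, flsl(6) = 3 and 6 is not a power of two, so p = 3 and
1 << p = 8 messages; for sc->ports = 8, flsl(8) = 4 and 8 is a power
of two, so p = 4 - 1 = 3 and exactly 8 messages are requested.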
@@ -2326,8 +2407,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
open_fail:
if (ret) {
- if (sc->port[0].bctx != NULL)
- blockif_close(sc->port[0].bctx);
+ for (p = 0; p < sc->ports; p++) {
+ if (sc->port[p].bctx != NULL)
+ blockif_close(sc->port[p].bctx);
+ }
free(sc);
}
@@ -2351,6 +2434,14 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
/*
* Use separate emulation names to distinguish drive and atapi devices
*/
+struct pci_devemu pci_de_ahci = {
+ .pe_emu = "ahci",
+ .pe_init = pci_ahci_hd_init,
+ .pe_barwrite = pci_ahci_write,
+ .pe_barread = pci_ahci_read
+};
+PCI_EMUL_SET(pci_de_ahci);
+
struct pci_devemu pci_de_ahci_hd = {
.pe_emu = "ahci-hd",
.pe_init = pci_ahci_hd_init,
diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c
new file mode 100644
index 0000000..03a324e
--- /dev/null
+++ b/usr.sbin/bhyve/pci_e82545.c
@@ -0,0 +1,2372 @@
+/*
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
+ * Copyright (c) 2013 Jeremiah Lott, Avere Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/limits.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <md5.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "e1000_regs.h"
+#include "e1000_defines.h"
+#include "mii.h"
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+
+/* Hardware/register definitions XXX: move some to common code. */
+#define E82545_VENDOR_ID_INTEL 0x8086
+#define E82545_DEV_ID_82545EM_COPPER 0x100F
+#define E82545_SUBDEV_ID 0x1008
+
+#define E82545_REVISION_4 4
+
+#define E82545_MDIC_DATA_MASK 0x0000FFFF
+#define E82545_MDIC_OP_MASK 0x0c000000
+#define E82545_MDIC_IE 0x20000000
+
+#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */
+#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */
+#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */
+
+#define E82545_BAR_REGISTER 0
+#define E82545_BAR_REGISTER_LEN (128*1024)
+#define E82545_BAR_FLASH 1
+#define E82545_BAR_FLASH_LEN (64*1024)
+#define E82545_BAR_IO 2
+#define E82545_BAR_IO_LEN 8
+
+#define E82545_IOADDR 0x00000000
+#define E82545_IODATA 0x00000004
+#define E82545_IO_REGISTER_MAX 0x0001FFFF
+#define E82545_IO_FLASH_BASE 0x00080000
+#define E82545_IO_FLASH_MAX 0x000FFFFF
+
+#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2))
+#define E82545_RAR_MAX 15
+#define E82545_MTA_MAX 127
+#define E82545_VFTA_MAX 127
+
+/* Slightly modified from the driver versions, hardcoded for 3 opcode bits,
+ * followed by 6 address bits.
+ * TODO: make opcode bits and addr bits configurable?
+ * NVM Commands - Microwire */
+#define E82545_NVM_OPCODE_BITS 3
+#define E82545_NVM_ADDR_BITS 6
+#define E82545_NVM_DATA_BITS 16
+#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS)
+#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1)
+#define E82545_NVM_OPCODE_MASK \
+ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS)
+#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */
+#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */
+#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */
+#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */
+
+#define E82545_NVM_EEPROM_SIZE	64	/* 64 16-bit words == 128 bytes */
+
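Worked example of the Microwire command layout above (3 opcode bits
followed by 6 address bits, 9 bits total): a READ of EEPROM word 0x12
is E82545_NVM_OPCODE_READ | 0x12 = (0x6 << 6) | 0x12 = 0x192, shifted
in bit by bit before the 16 data bits are shifted out.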
+#define E1000_ICR_SRPD 0x00010000
+
+/* This is an arbitrary number. There is no hard limit on the chip. */
+#define I82545_MAX_TXSEGS 64
+
+/* Legacy receive descriptor */
+struct e1000_rx_desc {
+ uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ uint16_t length; /* Length of data DMAed into data buffer */
+ uint16_t csum; /* Packet checksum */
+ uint8_t status; /* Descriptor status */
+ uint8_t errors; /* Descriptor Errors */
+ uint16_t special;
+};
+
+/* Transmit descriptor types */
+#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000)
+#define E1000_TXD_TYP_L (0)
+#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C)
+#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)
+
+/* Legacy transmit descriptor */
+struct e1000_tx_desc {
+ uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ union {
+ uint32_t data;
+ struct {
+ uint16_t length; /* Data buffer length */
+ uint8_t cso; /* Checksum offset */
+ uint8_t cmd; /* Descriptor control */
+ } flags;
+ } lower;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t css; /* Checksum start */
+ uint16_t special;
+ } fields;
+ } upper;
+};
+
+/* Context descriptor */
+struct e1000_context_desc {
+ union {
+ uint32_t ip_config;
+ struct {
+ uint8_t ipcss; /* IP checksum start */
+ uint8_t ipcso; /* IP checksum offset */
+ uint16_t ipcse; /* IP checksum end */
+ } ip_fields;
+ } lower_setup;
+ union {
+ uint32_t tcp_config;
+ struct {
+ uint8_t tucss; /* TCP checksum start */
+ uint8_t tucso; /* TCP checksum offset */
+ uint16_t tucse; /* TCP checksum end */
+ } tcp_fields;
+ } upper_setup;
+ uint32_t cmd_and_length;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t hdr_len; /* Header length */
+ uint16_t mss; /* Maximum segment size */
+ } fields;
+ } tcp_seg_setup;
+};
+
+/* Data descriptor */
+struct e1000_data_desc {
+	uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ union {
+ uint32_t data;
+ struct {
+ uint16_t length; /* Data buffer length */
+ uint8_t typ_len_ext;
+ uint8_t cmd;
+ } flags;
+ } lower;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t popts; /* Packet Options */
+ uint16_t special;
+ } fields;
+ } upper;
+};
+
+union e1000_tx_udesc {
+ struct e1000_tx_desc td;
+ struct e1000_context_desc cd;
+ struct e1000_data_desc dd;
+};
+
+/* Tx checksum info for a packet. */
+struct ck_info {
+ int ck_valid; /* ck_info is valid */
+	uint8_t ck_start; /* start byte of cksum calculation */
+ uint8_t ck_off; /* offset of cksum insertion */
+ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */
+};
+
+/*
+ * Debug printf
+ */
+static int e82545_debug = 0;
+#define DPRINTF(msg,params...) do { \
+	if (e82545_debug) \
+		fprintf(stderr, "e82545: " msg, params); \
+} while (0)
+#define WPRINTF(msg,params...) fprintf(stderr, "e82545: " msg, params)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#define MAX(a,b) (((a)>(b))?(a):(b))
+
+/* s/w representation of the RAL/RAH regs */
+struct eth_uni {
+ int eu_valid;
+ int eu_addrsel;
+ struct ether_addr eu_eth;
+};
+
+
+struct e82545_softc {
+ struct pci_devinst *esc_pi;
+ struct vmctx *esc_ctx;
+ struct mevent *esc_mevp;
+ struct mevent *esc_mevpitr;
+ pthread_mutex_t esc_mtx;
+ struct ether_addr esc_mac;
+ int esc_tapfd;
+
+ /* General */
+ uint32_t esc_CTRL; /* x0000 device ctl */
+ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */
+ uint32_t esc_FCAH; /* x002C flow ctl addr hi */
+ uint32_t esc_FCT; /* x0030 flow ctl type */
+ uint32_t esc_VET; /* x0038 VLAN eth type */
+ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */
+ uint32_t esc_LEDCTL; /* x0E00 LED control */
+ uint32_t esc_PBA; /* x1000 pkt buffer allocation */
+
+ /* Interrupt control */
+ int esc_irq_asserted;
+ uint32_t esc_ICR; /* x00C0 cause read/clear */
+ uint32_t esc_ITR; /* x00C4 intr throttling */
+ uint32_t esc_ICS; /* x00C8 cause set */
+ uint32_t esc_IMS; /* x00D0 mask set/read */
+ uint32_t esc_IMC; /* x00D8 mask clear */
+
+ /* Transmit */
+ union e1000_tx_udesc *esc_txdesc;
+ struct e1000_context_desc esc_txctx;
+ pthread_t esc_tx_tid;
+ pthread_cond_t esc_tx_cond;
+ int esc_tx_enabled;
+ int esc_tx_active;
+ uint32_t esc_TXCW; /* x0178 transmit config */
+ uint32_t esc_TCTL; /* x0400 transmit ctl */
+ uint32_t esc_TIPG; /* x0410 inter-packet gap */
+ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */
+ uint64_t esc_tdba; /* verified 64-bit desc table addr */
+ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */
+ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */
+ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */
+ uint16_t esc_TDH; /* x3810 desc table head idx */
+ uint16_t esc_TDHr; /* internal read version of TDH */
+ uint16_t esc_TDT; /* x3818 desc table tail idx */
+ uint32_t esc_TIDV; /* x3820 intr delay */
+ uint32_t esc_TXDCTL; /* x3828 desc control */
+ uint32_t esc_TADV; /* x382C intr absolute delay */
+
+ /* L2 frame acceptance */
+ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */
+ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */
+ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */
+
+ /* Receive */
+ struct e1000_rx_desc *esc_rxdesc;
+ pthread_cond_t esc_rx_cond;
+ int esc_rx_enabled;
+ int esc_rx_active;
+ int esc_rx_loopback;
+ uint32_t esc_RCTL; /* x0100 receive ctl */
+ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */
+ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */
+ uint64_t esc_rdba; /* verified 64-bit desc table addr */
+ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */
+ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/
+ uint32_t esc_RDLEN; /* x2808 #descriptors */
+ uint16_t esc_RDH; /* x2810 desc table head idx */
+ uint16_t esc_RDT; /* x2818 desc table tail idx */
+ uint32_t esc_RDTR; /* x2820 intr delay */
+ uint32_t esc_RXDCTL; /* x2828 desc control */
+ uint32_t esc_RADV; /* x282C intr absolute delay */
+ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */
+ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */
+
+ /* IO Port register access */
+ uint32_t io_addr;
+
+ /* Shadow copy of MDIC */
+ uint32_t mdi_control;
+ /* Shadow copy of EECD */
+ uint32_t eeprom_control;
+ /* Latest NVM in/out */
+ uint16_t nvm_data;
+ uint16_t nvm_opaddr;
+ /* stats */
+ uint32_t missed_pkt_count; /* dropped for no room in rx queue */
+ uint32_t pkt_rx_by_size[6];
+ uint32_t pkt_tx_by_size[6];
+ uint32_t good_pkt_rx_count;
+ uint32_t bcast_pkt_rx_count;
+ uint32_t mcast_pkt_rx_count;
+ uint32_t good_pkt_tx_count;
+ uint32_t bcast_pkt_tx_count;
+ uint32_t mcast_pkt_tx_count;
+ uint32_t oversize_rx_count;
+ uint32_t tso_tx_count;
+ uint64_t good_octets_rx;
+ uint64_t good_octets_tx;
+ uint64_t missed_octets; /* counts missed and oversized */
+
+ uint8_t nvm_bits:6; /* number of bits remaining in/out */
+ uint8_t nvm_mode:2;
+#define E82545_NVM_MODE_OPADDR 0x0
+#define E82545_NVM_MODE_DATAIN 0x1
+#define E82545_NVM_MODE_DATAOUT 0x2
+ /* EEPROM data */
+ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE];
+};
+
+static void e82545_reset(struct e82545_softc *sc, int drvr);
+static void e82545_rx_enable(struct e82545_softc *sc);
+static void e82545_rx_disable(struct e82545_softc *sc);
+static void e82545_tap_callback(int fd, enum ev_type type, void *param);
+static void e82545_tx_start(struct e82545_softc *sc);
+static void e82545_tx_enable(struct e82545_softc *sc);
+static void e82545_tx_disable(struct e82545_softc *sc);
+
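+/*
+ * Map a packet size to the bucket index used by the pkt_rx_by_size and
+ * pkt_tx_by_size stats, mirroring the PRC64..PRC1522 / PTC64..PTC1522
+ * hardware counters.
+ */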
+static inline int
+e82545_size_stat_index(uint32_t size)
+{
+ if (size <= 64) {
+ return 0;
+ } else if (size >= 1024) {
+ return 5;
+ } else {
+		/* fls() yields 1-4 for sizes 65-1023 */
+		return (fls(size) - 6);
+ }
+}
+
+static void
+e82545_init_eeprom(struct e82545_softc *sc)
+{
+ uint16_t checksum, i;
+
+ /* mac addr */
+ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) |
+ (((uint16_t)sc->esc_mac.octet[1]) << 8);
+ sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) |
+ (((uint16_t)sc->esc_mac.octet[3]) << 8);
+ sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) |
+ (((uint16_t)sc->esc_mac.octet[5]) << 8);
+
+ /* pci ids */
+ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID;
+ sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL;
+ sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER;
+ sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL;
+
+ /* fill in the checksum */
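+	/* The checksum word makes words 0..NVM_CHECKSUM_REG sum to NVM_SUM. */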
+ checksum = 0;
+ for (i = 0; i < NVM_CHECKSUM_REG; i++) {
+ checksum += sc->eeprom_data[i];
+ }
+ checksum = NVM_SUM - checksum;
+ sc->eeprom_data[NVM_CHECKSUM_REG] = checksum;
+ DPRINTF("eeprom checksum: 0x%x\r\n", checksum);
+}
+
+static void
+e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr,
+ uint8_t phy_addr, uint32_t data)
+{
+ DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data);
+}
+
+static uint32_t
+e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr,
+ uint8_t phy_addr)
+{
+ //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr);
+ switch (reg_addr) {
+ case PHY_STATUS:
+ return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS |
+ MII_SR_AUTONEG_COMPLETE);
+ case PHY_AUTONEG_ADV:
+ return NWAY_AR_SELECTOR_FIELD;
+ case PHY_LP_ABILITY:
+ return 0;
+ case PHY_1000T_STATUS:
+ return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS |
+ SR_1000T_LOCAL_RX_STATUS);
+ case PHY_ID1:
+ return (M88E1011_I_PHY_ID >> 16) & 0xFFFF;
+ case PHY_ID2:
+ return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF;
+ default:
+ DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr);
+ return 0;
+ }
+ /* not reached */
+}
+
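+/*
+ * Advance the Microwire EEPROM state machine by one bit.  This is called
+ * on each low-to-high transition of EECD.SK while chip select is asserted,
+ * shifting a single opcode/address or data bit per strobe.
+ */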
+static void
+e82545_eecd_strobe(struct e82545_softc *sc)
+{
+ /* Microwire state machine */
+ /*
+	DPRINTF("eeprom state machine strobe "
+ "0x%x 0x%x 0x%x 0x%x\r\n",
+ sc->nvm_mode, sc->nvm_bits,
+ sc->nvm_opaddr, sc->nvm_data);*/
+
+ if (sc->nvm_bits == 0) {
+ DPRINTF("eeprom state machine not expecting data! "
+ "0x%x 0x%x 0x%x 0x%x\r\n",
+ sc->nvm_mode, sc->nvm_bits,
+ sc->nvm_opaddr, sc->nvm_data);
+ return;
+ }
+ sc->nvm_bits--;
+ if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) {
+ /* shifting out */
+ if (sc->nvm_data & 0x8000) {
+ sc->eeprom_control |= E1000_EECD_DO;
+ } else {
+ sc->eeprom_control &= ~E1000_EECD_DO;
+ }
+ sc->nvm_data <<= 1;
+ if (sc->nvm_bits == 0) {
+ /* read done, back to opcode mode. */
+ sc->nvm_opaddr = 0;
+ sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+ sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+ }
+ } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) {
+ /* shifting in */
+ sc->nvm_data <<= 1;
+ if (sc->eeprom_control & E1000_EECD_DI) {
+ sc->nvm_data |= 1;
+ }
+ if (sc->nvm_bits == 0) {
+ /* eeprom write */
+ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
+ uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK;
+ if (op != E82545_NVM_OPCODE_WRITE) {
+ DPRINTF("Illegal eeprom write op 0x%x\r\n",
+ sc->nvm_opaddr);
+ } else if (addr >= E82545_NVM_EEPROM_SIZE) {
+ DPRINTF("Illegal eeprom write addr 0x%x\r\n",
+ sc->nvm_opaddr);
+ } else {
+ DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n",
+ addr, sc->nvm_data);
+ sc->eeprom_data[addr] = sc->nvm_data;
+ }
+ /* back to opcode mode */
+ sc->nvm_opaddr = 0;
+ sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+ sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+ }
+ } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) {
+ sc->nvm_opaddr <<= 1;
+ if (sc->eeprom_control & E1000_EECD_DI) {
+ sc->nvm_opaddr |= 1;
+ }
+ if (sc->nvm_bits == 0) {
+ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
+ switch (op) {
+ case E82545_NVM_OPCODE_EWEN:
+ DPRINTF("eeprom write enable: 0x%x\r\n",
+ sc->nvm_opaddr);
+ /* back to opcode mode */
+ sc->nvm_opaddr = 0;
+ sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+ sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+ break;
+ case E82545_NVM_OPCODE_READ:
+ {
+ uint16_t addr = sc->nvm_opaddr &
+ E82545_NVM_ADDR_MASK;
+ sc->nvm_mode = E82545_NVM_MODE_DATAOUT;
+ sc->nvm_bits = E82545_NVM_DATA_BITS;
+ if (addr < E82545_NVM_EEPROM_SIZE) {
+ sc->nvm_data = sc->eeprom_data[addr];
+ DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n",
+ addr, sc->nvm_data);
+ } else {
+ DPRINTF("eeprom illegal read: 0x%x\r\n",
+ sc->nvm_opaddr);
+ sc->nvm_data = 0;
+ }
+ break;
+ }
+ case E82545_NVM_OPCODE_WRITE:
+ sc->nvm_mode = E82545_NVM_MODE_DATAIN;
+ sc->nvm_bits = E82545_NVM_DATA_BITS;
+ sc->nvm_data = 0;
+ break;
+ default:
+			DPRINTF("eeprom unknown op: 0x%x\r\n",
+ sc->nvm_opaddr);
+ /* back to opcode mode */
+ sc->nvm_opaddr = 0;
+ sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+ sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+ }
+ }
+ } else {
+ DPRINTF("eeprom state machine wrong state! "
+ "0x%x 0x%x 0x%x 0x%x\r\n",
+ sc->nvm_mode, sc->nvm_bits,
+ sc->nvm_opaddr, sc->nvm_data);
+ }
+}
+
+static void
+e82545_itr_callback(int fd, enum ev_type type, void *param)
+{
+ uint32_t new;
+ struct e82545_softc *sc = param;
+
+ pthread_mutex_lock(&sc->esc_mtx);
+ new = sc->esc_ICR & sc->esc_IMS;
+ if (new && !sc->esc_irq_asserted) {
+ DPRINTF("itr callback: lintr assert %x\r\n", new);
+ sc->esc_irq_asserted = 1;
+ pci_lintr_assert(sc->esc_pi);
+ } else {
+ mevent_delete(sc->esc_mevpitr);
+ sc->esc_mevpitr = NULL;
+ }
+ pthread_mutex_unlock(&sc->esc_mtx);
+}
+
+static void
+e82545_icr_assert(struct e82545_softc *sc, uint32_t bits)
+{
+ uint32_t new;
+
+ DPRINTF("icr assert: 0x%x\r\n", bits);
+
+ /*
+ * An interrupt is only generated if bits are set that
+ * aren't already in the ICR, these bits are unmasked,
+ * and there isn't an interrupt already pending.
+ */
+ new = bits & ~sc->esc_ICR & sc->esc_IMS;
+ sc->esc_ICR |= bits;
+
+ if (new == 0) {
+ DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS);
+ } else if (sc->esc_mevpitr != NULL) {
+ DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS);
+ } else if (!sc->esc_irq_asserted) {
+ DPRINTF("icr assert: lintr assert %x\r\n", new);
+ sc->esc_irq_asserted = 1;
+ pci_lintr_assert(sc->esc_pi);
+ if (sc->esc_ITR != 0) {
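+			/*
+			 * ITR counts 256 ns units; round up to the 1 ms
+			 * granularity of the event loop timer.
+			 */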
+ sc->esc_mevpitr = mevent_add(
+ (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
+ EVF_TIMER, e82545_itr_callback, sc);
+ }
+ }
+}
+
+static void
+e82545_ims_change(struct e82545_softc *sc, uint32_t bits)
+{
+ uint32_t new;
+
+ /*
+ * Changing the mask may allow previously asserted
+ * but masked interrupt requests to generate an interrupt.
+ */
+ new = bits & sc->esc_ICR & ~sc->esc_IMS;
+ sc->esc_IMS |= bits;
+
+ if (new == 0) {
+ DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS);
+ } else if (sc->esc_mevpitr != NULL) {
+ DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS);
+ } else if (!sc->esc_irq_asserted) {
+		DPRINTF("ims change: lintr assert %x\r\n", new);
+ sc->esc_irq_asserted = 1;
+ pci_lintr_assert(sc->esc_pi);
+ if (sc->esc_ITR != 0) {
+ sc->esc_mevpitr = mevent_add(
+ (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
+ EVF_TIMER, e82545_itr_callback, sc);
+ }
+ }
+}
+
+static void
+e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits)
+{
+
+ DPRINTF("icr deassert: 0x%x\r\n", bits);
+ sc->esc_ICR &= ~bits;
+
+ /*
+ * If there are no longer any interrupt sources and there
+ * was an asserted interrupt, clear it
+ */
+ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) {
+ DPRINTF("icr deassert: lintr deassert %x\r\n", bits);
+ pci_lintr_deassert(sc->esc_pi);
+ sc->esc_irq_asserted = 0;
+ }
+}
+
+static void
+e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value)
+{
+
+	DPRINTF("intr_write: off %x, val %x\r\n", offset, value);
+
+ switch (offset) {
+ case E1000_ICR:
+ e82545_icr_deassert(sc, value);
+ break;
+ case E1000_ITR:
+ sc->esc_ITR = value;
+ break;
+ case E1000_ICS:
+ sc->esc_ICS = value; /* not used: store for debug */
+ e82545_icr_assert(sc, value);
+ break;
+ case E1000_IMS:
+ e82545_ims_change(sc, value);
+ break;
+ case E1000_IMC:
+ sc->esc_IMC = value; /* for debug */
+ sc->esc_IMS &= ~value;
+ // XXX clear interrupts if all ICR bits now masked
+ // and interrupt was pending ?
+ break;
+ default:
+ break;
+ }
+}
+
+static uint32_t
+e82545_intr_read(struct e82545_softc *sc, uint32_t offset)
+{
+ uint32_t retval;
+
+ retval = 0;
+
+	DPRINTF("intr_read: off %x\r\n", offset);
+
+ switch (offset) {
+ case E1000_ICR:
+ retval = sc->esc_ICR;
+ sc->esc_ICR = 0;
+ e82545_icr_deassert(sc, ~0);
+ break;
+ case E1000_ITR:
+ retval = sc->esc_ITR;
+ break;
+ case E1000_ICS:
+ /* write-only register */
+ break;
+ case E1000_IMS:
+ retval = sc->esc_IMS;
+ break;
+ case E1000_IMC:
+ /* write-only register */
+ break;
+ default:
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+e82545_devctl(struct e82545_softc *sc, uint32_t val)
+{
+
+ sc->esc_CTRL = val & ~E1000_CTRL_RST;
+
+ if (val & E1000_CTRL_RST) {
+ DPRINTF("e1k: s/w reset, ctl %x\n", val);
+ e82545_reset(sc, 1);
+ }
+ /* XXX check for phy reset ? */
+}
+
+static void
+e82545_rx_update_rdba(struct e82545_softc *sc)
+{
+
+ /* XXX verify desc base/len within phys mem range */
+ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 |
+ sc->esc_RDBAL;
+
+ /* Cache host mapping of guest descriptor array */
+ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx,
+ sc->esc_rdba, sc->esc_RDLEN);
+}
+
+static void
+e82545_rx_ctl(struct e82545_softc *sc, uint32_t val)
+{
+ int on;
+
+ on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN);
+
+ /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */
+ sc->esc_RCTL = val & ~0xF9204c01;
+
+ DPRINTF("rx_ctl - %s RCTL %x, val %x\n",
+ on ? "on" : "off", sc->esc_RCTL, val);
+
+ /* state change requested */
+ if (on != sc->esc_rx_enabled) {
+ if (on) {
+ /* Catch disallowed/unimplemented settings */
+ //assert(!(val & E1000_RCTL_LBM_TCVR));
+
+ if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) {
+ sc->esc_rx_loopback = 1;
+ } else {
+ sc->esc_rx_loopback = 0;
+ }
+
+ e82545_rx_update_rdba(sc);
+ e82545_rx_enable(sc);
+ } else {
+ e82545_rx_disable(sc);
+ sc->esc_rx_loopback = 0;
+ sc->esc_rdba = 0;
+ sc->esc_rxdesc = NULL;
+ }
+ }
+}
+
+static void
+e82545_tx_update_tdba(struct e82545_softc *sc)
+{
+
+ /* XXX verify desc base/len within phys mem range */
+ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL;
+
+ /* Cache host mapping of guest descriptor array */
+ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba,
+ sc->esc_TDLEN);
+}
+
+static void
+e82545_tx_ctl(struct e82545_softc *sc, uint32_t val)
+{
+ int on;
+
+ on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN);
+
+ /* ignore TCTL_EN settings that don't change state */
+ if (on == sc->esc_tx_enabled)
+ return;
+
+ if (on) {
+ e82545_tx_update_tdba(sc);
+ e82545_tx_enable(sc);
+ } else {
+ e82545_tx_disable(sc);
+ sc->esc_tdba = 0;
+ sc->esc_txdesc = NULL;
+ }
+
+ /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */
+ sc->esc_TCTL = val & ~0xFE800005;
+}
+
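+/*
+ * Decode the RCTL buffer size field into bytes; the BSEX bit scales the
+ * base sizes by 16 (e.g. 1024 becomes 16384).
+ */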
+static int
+e82545_bufsz(uint32_t rctl)
+{
+
+ switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) {
+ case (E1000_RCTL_SZ_2048): return (2048);
+ case (E1000_RCTL_SZ_1024): return (1024);
+ case (E1000_RCTL_SZ_512): return (512);
+ case (E1000_RCTL_SZ_256): return (256);
+ case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384);
+ case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192);
+ case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096);
+ }
+ return (256); /* Forbidden value. */
+}
+
+static uint8_t dummybuf[2048];
+
+/* XXX one packet at a time until this is debugged */
+static void
+e82545_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct e82545_softc *sc = param;
+ struct e1000_rx_desc *rxd;
+ struct iovec vec[64];
+ int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size;
+ uint32_t cause = 0;
+ uint16_t *tp, tag, head;
+
+ pthread_mutex_lock(&sc->esc_mtx);
+ DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT);
+
+ if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
+ DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n",
+ sc->esc_rx_enabled, sc->esc_rx_loopback);
+ while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ }
+ goto done1;
+ }
+ bufsz = e82545_bufsz(sc->esc_RCTL);
+ maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522;
+ maxpktdesc = (maxpktsz + bufsz - 1) / bufsz;
+ size = sc->esc_RDLEN / 16;
+ head = sc->esc_RDH;
+ left = (size + sc->esc_RDT - head) % size;
+ if (left < maxpktdesc) {
+ DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n",
+ left, maxpktdesc);
+ while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ }
+ goto done1;
+ }
+
+ sc->esc_rx_active = 1;
+ pthread_mutex_unlock(&sc->esc_mtx);
+
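+	/*
+	 * Process at most a quarter of the ring per wakeup, presumably to
+	 * bound the time spent with the receiver marked active.
+	 */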
+ for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) {
+
+ /* Grab rx descriptor pointed to by the head pointer */
+ for (i = 0; i < maxpktdesc; i++) {
+ rxd = &sc->esc_rxdesc[(head + i) % size];
+ vec[i].iov_base = paddr_guest2host(sc->esc_ctx,
+ rxd->buffer_addr, bufsz);
+ vec[i].iov_len = bufsz;
+ }
+ len = readv(sc->esc_tapfd, vec, maxpktdesc);
+ if (len <= 0) {
+ DPRINTF("tap: readv() returned %d\n", len);
+ goto done;
+ }
+
+ /*
+ * Adjust the packet length based on whether the CRC needs
+ * to be stripped or if the packet is less than the minimum
+ * eth packet size.
+ */
+ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN)
+ len = ETHER_MIN_LEN - ETHER_CRC_LEN;
+ if (!(sc->esc_RCTL & E1000_RCTL_SECRC))
+ len += ETHER_CRC_LEN;
+ n = (len + bufsz - 1) / bufsz;
+
+ DPRINTF("packet read %d bytes, %d segs, head %d\r\n",
+ len, n, head);
+
+ /* Apply VLAN filter. */
+ tp = (uint16_t *)vec[0].iov_base + 6;
+ if ((sc->esc_RCTL & E1000_RCTL_VFE) &&
+ (ntohs(tp[0]) == sc->esc_VET)) {
+ tag = ntohs(tp[1]) & 0x0fff;
+ if ((sc->esc_fvlan[tag >> 5] &
+ (1 << (tag & 0x1f))) != 0) {
+ DPRINTF("known VLAN %d\r\n", tag);
+ } else {
+ DPRINTF("unknown VLAN %d\r\n", tag);
+ n = 0;
+ continue;
+ }
+ }
+
+ /* Update all consumed descriptors. */
+ for (i = 0; i < n - 1; i++) {
+ rxd = &sc->esc_rxdesc[(head + i) % size];
+ rxd->length = bufsz;
+ rxd->csum = 0;
+ rxd->errors = 0;
+ rxd->special = 0;
+ rxd->status = E1000_RXD_STAT_DD;
+ }
+ rxd = &sc->esc_rxdesc[(head + i) % size];
+ rxd->length = len % bufsz;
+ rxd->csum = 0;
+ rxd->errors = 0;
+ rxd->special = 0;
+ /* XXX signal no checksum for now */
+ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM |
+ E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD;
+
+ /* Schedule receive interrupts. */
+ if (len <= sc->esc_RSRPD) {
+ cause |= E1000_ICR_SRPD | E1000_ICR_RXT0;
+ } else {
+ /* XXX: RDRT and RADV timers should be here. */
+ cause |= E1000_ICR_RXT0;
+ }
+
+ head = (head + n) % size;
+ left -= n;
+ }
+
+done:
+ pthread_mutex_lock(&sc->esc_mtx);
+ sc->esc_rx_active = 0;
+ if (sc->esc_rx_enabled == 0)
+ pthread_cond_signal(&sc->esc_rx_cond);
+
+ sc->esc_RDH = head;
+ /* Respect E1000_RCTL_RDMTS */
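+	/* RDMTS (RCTL bits 9:8) sets the threshold at ring size / 2^(RDMTS+1). */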
+ left = (size + sc->esc_RDT - head) % size;
+ if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1)))
+ cause |= E1000_ICR_RXDMT0;
+ /* Assert all accumulated interrupts. */
+ if (cause != 0)
+ e82545_icr_assert(sc, cause);
+done1:
+ DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT);
+ pthread_mutex_unlock(&sc->esc_mtx);
+}
+
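+/*
+ * Fold a 32-bit checksum accumulator into 16 bits with end-around carry;
+ * e.g. 0x1FFFE folds to 0xFFFF.
+ */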
+static uint16_t
+e82545_carry(uint32_t sum)
+{
+
+ sum = (sum & 0xFFFF) + (sum >> 16);
+ if (sum > 0xFFFF)
+ sum -= 0xFFFF;
+ return (sum);
+}
+
+static uint16_t
+e82545_buf_checksum(uint8_t *buf, int len)
+{
+ int i;
+ uint32_t sum = 0;
+
+ /* Checksum all the pairs of bytes first... */
+ for (i = 0; i < (len & ~1U); i += 2)
+ sum += *((u_int16_t *)(buf + i));
+
+ /*
+ * If there's a single byte left over, checksum it, too.
+ * Network byte order is big-endian, so the remaining byte is
+ * the high byte.
+ */
+ if (i < len)
+ sum += htons(buf[i] << 8);
+
+ return (e82545_carry(sum));
+}
+
+static uint16_t
+e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len)
+{
+ int now, odd;
+ uint32_t sum = 0, s;
+
+ /* Skip completely unneeded vectors. */
+ while (iovcnt > 0 && iov->iov_len <= off && off > 0) {
+ off -= iov->iov_len;
+ iov++;
+ iovcnt--;
+ }
+
+ /* Calculate checksum of requested range. */
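+	/*
+	 * 'odd' tracks whether the previous chunk ended on an odd byte;
+	 * if so, the next partial sum is byte-swapped to preserve 16-bit
+	 * alignment across chunks.
+	 */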
+ odd = 0;
+ while (len > 0 && iovcnt > 0) {
+ now = MIN(len, iov->iov_len - off);
+ s = e82545_buf_checksum(iov->iov_base + off, now);
+ sum += odd ? (s << 8) : s;
+ odd ^= (now & 1);
+ len -= now;
+ off = 0;
+ iov++;
+ iovcnt--;
+ }
+
+ return (e82545_carry(sum));
+}
+
+/*
+ * Return the transmit descriptor type.
+ */
+static int
+e82545_txdesc_type(uint32_t lower)
+{
+ int type;
+
+ type = 0;
+
+ if (lower & E1000_TXD_CMD_DEXT)
+ type = lower & E1000_TXD_MASK;
+
+ return (type);
+}
+
+static void
+e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck)
+{
+ uint16_t cksum;
+ int cklen;
+
+ DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n",
+ iovcnt, ck->ck_start, ck->ck_off, ck->ck_len);
+ cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1 : INT_MAX;
+ cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen);
+ *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum;
+}
+
+static void
+e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt)
+{
+
+ if (sc->esc_tapfd == -1)
+ return;
+
+ (void) writev(sc->esc_tapfd, iov, iovcnt);
+}
+
+static void
+e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail,
+ uint16_t dsize, int *tdwb)
+{
+ union e1000_tx_udesc *dsc;
+
+ for ( ; head != tail; head = (head + 1) % dsize) {
+ dsc = &sc->esc_txdesc[head];
+ if (dsc->td.lower.data & E1000_TXD_CMD_RS) {
+ dsc->td.upper.data |= E1000_TXD_STAT_DD;
+ *tdwb = 1;
+ }
+ }
+}
+
+static int
+e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
+ uint16_t dsize, uint16_t *rhead, int *tdwb)
+{
+ uint8_t *hdr, *hdrp;
+ struct iovec iovb[I82545_MAX_TXSEGS + 2];
+ struct iovec tiov[I82545_MAX_TXSEGS + 2];
+ struct e1000_context_desc *cd;
+ struct ck_info ckinfo[2];
+ struct iovec *iov;
+ union e1000_tx_udesc *dsc;
+ int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso;
+ int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff;
+ uint32_t tcpsum, tcpseq;
+ uint16_t ipcs, tcpcs, ipid, ohead;
+
+ ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0;
+ iovcnt = 0;
+ tlen = 0;
+ ntype = 0;
+ tso = 0;
+ ohead = head;
+
+ /* iovb[0/1] may be used for writable copy of headers. */
+ iov = &iovb[2];
+
+ for (desc = 0; ; desc++, head = (head + 1) % dsize) {
+ if (head == tail) {
+ *rhead = head;
+ return (0);
+ }
+ dsc = &sc->esc_txdesc[head];
+ dtype = e82545_txdesc_type(dsc->td.lower.data);
+
+ if (desc == 0) {
+ switch (dtype) {
+ case E1000_TXD_TYP_C:
+ DPRINTF("tx ctxt desc idx %d: %016jx "
+ "%08x%08x\r\n",
+ head, dsc->td.buffer_addr,
+ dsc->td.upper.data, dsc->td.lower.data);
+ /* Save context and return */
+ sc->esc_txctx = dsc->cd;
+ goto done;
+ case E1000_TXD_TYP_L:
+ DPRINTF("tx legacy desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ /*
+ * legacy cksum start valid in first descriptor
+ */
+ ntype = dtype;
+ ckinfo[0].ck_start = dsc->td.upper.fields.css;
+ break;
+ case E1000_TXD_TYP_D:
+ DPRINTF("tx data desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ ntype = dtype;
+ break;
+ default:
+ break;
+ }
+ } else {
+ /* Descriptor type must be consistent */
+ assert(dtype == ntype);
+ DPRINTF("tx next desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ }
+
+ len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length :
+ dsc->dd.lower.data & 0xFFFFF;
+
+ if (len > 0) {
+ /* Strip checksum supplied by guest. */
+ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 &&
+ (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0)
+ len -= 2;
+ tlen += len;
+ if (iovcnt < I82545_MAX_TXSEGS) {
+ iov[iovcnt].iov_base = paddr_guest2host(
+ sc->esc_ctx, dsc->td.buffer_addr, len);
+ iov[iovcnt].iov_len = len;
+ }
+ iovcnt++;
+ }
+
+ /*
+ * Pull out info that is valid in the final descriptor
+ * and exit descriptor loop.
+ */
+ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) {
+ if (dtype == E1000_TXD_TYP_L) {
+ if (dsc->td.lower.data & E1000_TXD_CMD_IC) {
+ ckinfo[0].ck_valid = 1;
+ ckinfo[0].ck_off =
+ dsc->td.lower.flags.cso;
+ ckinfo[0].ck_len = 0;
+ }
+ } else {
+ cd = &sc->esc_txctx;
+ if (dsc->dd.lower.data & E1000_TXD_CMD_TSE)
+ tso = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_IXSM)
+ ckinfo[0].ck_valid = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_IXSM || tso) {
+ ckinfo[0].ck_start =
+ cd->lower_setup.ip_fields.ipcss;
+ ckinfo[0].ck_off =
+ cd->lower_setup.ip_fields.ipcso;
+ ckinfo[0].ck_len =
+ cd->lower_setup.ip_fields.ipcse;
+ }
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_TXSM)
+ ckinfo[1].ck_valid = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_TXSM || tso) {
+ ckinfo[1].ck_start =
+ cd->upper_setup.tcp_fields.tucss;
+ ckinfo[1].ck_off =
+ cd->upper_setup.tcp_fields.tucso;
+ ckinfo[1].ck_len =
+ cd->upper_setup.tcp_fields.tucse;
+ }
+ }
+ break;
+ }
+ }
+
+ if (iovcnt > I82545_MAX_TXSEGS) {
+ WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n",
+ iovcnt, I82545_MAX_TXSEGS);
+ goto done;
+ }
+
+ hdrlen = vlen = 0;
+ /* Estimate writable space for VLAN header insertion. */
+ if ((sc->esc_CTRL & E1000_CTRL_VME) &&
+ (dsc->td.lower.data & E1000_TXD_CMD_VLE)) {
+ hdrlen = ETHER_ADDR_LEN*2;
+ vlen = ETHER_VLAN_ENCAP_LEN;
+ }
+ if (!tso) {
+ /* Estimate required writable space for checksums. */
+ if (ckinfo[0].ck_valid)
+ hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2);
+ if (ckinfo[1].ck_valid)
+ hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2);
+ /* Round up writable space to the first vector. */
+ if (hdrlen != 0 && iov[0].iov_len > hdrlen &&
+ iov[0].iov_len < hdrlen + 100)
+ hdrlen = iov[0].iov_len;
+ } else {
+		/* For TSO, the header length is provided by software. */
+ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len;
+ }
+
+ /* Allocate, fill and prepend writable header vector. */
+ if (hdrlen != 0) {
+ hdr = __builtin_alloca(hdrlen + vlen);
+ hdr += vlen;
+ for (left = hdrlen, hdrp = hdr; left > 0;
+ left -= now, hdrp += now) {
+ now = MIN(left, iov->iov_len);
+ memcpy(hdrp, iov->iov_base, now);
+ iov->iov_base += now;
+ iov->iov_len -= now;
+ if (iov->iov_len == 0) {
+ iov++;
+ iovcnt--;
+ }
+ }
+ iov--;
+ iovcnt++;
+ iov->iov_base = hdr;
+ iov->iov_len = hdrlen;
+ }
+
+ /* Insert VLAN tag. */
+ if (vlen != 0) {
+ hdr -= ETHER_VLAN_ENCAP_LEN;
+ memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2);
+ hdrlen += ETHER_VLAN_ENCAP_LEN;
+ hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8;
+ hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff;
+ hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8;
+ hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff;
+ iov->iov_base = hdr;
+ iov->iov_len += ETHER_VLAN_ENCAP_LEN;
+ /* Correct checksum offsets after VLAN tag insertion. */
+ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN;
+ if (ckinfo[0].ck_len != 0)
+ ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN;
+ if (ckinfo[1].ck_len != 0)
+ ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN;
+ }
+
+ /* Simple non-TSO case. */
+ if (!tso) {
+ /* Calculate checksums and transmit. */
+ if (ckinfo[0].ck_valid)
+ e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]);
+ if (ckinfo[1].ck_valid)
+ e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]);
+ e82545_transmit_backend(sc, iov, iovcnt);
+ goto done;
+ }
+
+ /* Doing TSO. */
+ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0;
+ mss = sc->esc_txctx.tcp_seg_setup.fields.mss;
+ paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff);
+ DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n",
+ tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt);
+ ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]);
+ tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]);
+ ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off];
+ tcpcs = 0;
+ if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */
+ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off];
+ pv = 1;
+ pvoff = 0;
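+	/*
+	 * Emit one MSS-sized segment per iteration: reuse the writable
+	 * header copy, patch the IP length/ID and TCP sequence number
+	 * (clearing FIN/PUSH on all but the last segment), recompute the
+	 * checksums, and hand the segment to the backend.
+	 */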
+ for (seg = 0, left = paylen; left > 0; seg++, left -= now) {
+ now = MIN(left, mss);
+
+ /* Construct IOVs for the segment. */
+ /* Include whole original header. */
+ tiov[0].iov_base = hdr;
+ tiov[0].iov_len = hdrlen;
+ tiovcnt = 1;
+ /* Include respective part of payload IOV. */
+ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) {
+ nnow = MIN(nleft, iov[pv].iov_len - pvoff);
+ tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff;
+ tiov[tiovcnt++].iov_len = nnow;
+ if (pvoff + nnow == iov[pv].iov_len) {
+ pv++;
+ pvoff = 0;
+ } else
+ pvoff += nnow;
+ }
+ DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n",
+ seg, hdrlen, now, tiovcnt);
+
+ /* Update IP header. */
+ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) {
+ /* IPv4 -- set length and ID */
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] =
+ htons(hdrlen - ckinfo[0].ck_start + now);
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] =
+ htons(ipid + seg);
+ } else {
+ /* IPv6 -- set length */
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] =
+ htons(hdrlen - ckinfo[0].ck_start - 40 +
+ now);
+ }
+
+ /* Update pseudo-header checksum. */
+ tcpsum = tcpcs;
+ tcpsum += htons(hdrlen - ckinfo[1].ck_start + now);
+
+ /* Update TCP/UDP headers. */
+ if (tcp) {
+ /* Update sequence number and FIN/PUSH flags. */
+ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] =
+ htonl(tcpseq + paylen - left);
+ if (now < left) {
+ hdr[ckinfo[1].ck_start + 13] &=
+ ~(TH_FIN | TH_PUSH);
+ }
+ } else {
+ /* Update payload length. */
+ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] =
+ hdrlen - ckinfo[1].ck_start + now;
+ }
+
+ /* Calculate checksums and transmit. */
+ if (ckinfo[0].ck_valid) {
+ *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs;
+ e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]);
+ }
+ if (ckinfo[1].ck_valid) {
+ *(uint16_t *)&hdr[ckinfo[1].ck_off] =
+ e82545_carry(tcpsum);
+ e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]);
+ }
+ e82545_transmit_backend(sc, tiov, tiovcnt);
+ }
+
+done:
+ head = (head + 1) % dsize;
+ e82545_transmit_done(sc, ohead, head, dsize, tdwb);
+
+ *rhead = head;
+ return (desc + 1);
+}
+
+static void
+e82545_tx_run(struct e82545_softc *sc)
+{
+ uint32_t cause;
+ uint16_t head, rhead, tail, size;
+ int lim, tdwb, sent;
+
+ head = sc->esc_TDH;
+ tail = sc->esc_TDT;
+ size = sc->esc_TDLEN / 16;
+ DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n",
+ sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT);
+
+ pthread_mutex_unlock(&sc->esc_mtx);
+ rhead = head;
+ tdwb = 0;
+ for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) {
+ sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb);
+ if (sent == 0)
+ break;
+ head = rhead;
+ }
+ pthread_mutex_lock(&sc->esc_mtx);
+
+ sc->esc_TDH = head;
+ sc->esc_TDHr = rhead;
+ cause = 0;
+ if (tdwb)
+ cause |= E1000_ICR_TXDW;
+ if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT)
+ cause |= E1000_ICR_TXQE;
+ if (cause)
+ e82545_icr_assert(sc, cause);
+
+ DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n",
+ sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT);
+}
+
+static void *
+e82545_tx_thread(void *param)
+{
+ struct e82545_softc *sc = param;
+
+ pthread_mutex_lock(&sc->esc_mtx);
+ for (;;) {
+ while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) {
+ if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT)
+ break;
+ sc->esc_tx_active = 0;
+ if (sc->esc_tx_enabled == 0)
+ pthread_cond_signal(&sc->esc_tx_cond);
+ pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx);
+ }
+ sc->esc_tx_active = 1;
+
+ /* Process some tx descriptors. Lock dropped inside. */
+ e82545_tx_run(sc);
+ }
+}
+
+static void
+e82545_tx_start(struct e82545_softc *sc)
+{
+
+ if (sc->esc_tx_active == 0)
+ pthread_cond_signal(&sc->esc_tx_cond);
+}
+
+static void
+e82545_tx_enable(struct e82545_softc *sc)
+{
+
+ sc->esc_tx_enabled = 1;
+}
+
+static void
+e82545_tx_disable(struct e82545_softc *sc)
+{
+
+ sc->esc_tx_enabled = 0;
+ while (sc->esc_tx_active)
+ pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx);
+}
+
+static void
+e82545_rx_enable(struct e82545_softc *sc)
+{
+
+ sc->esc_rx_enabled = 1;
+}
+
+static void
+e82545_rx_disable(struct e82545_softc *sc)
+{
+
+ sc->esc_rx_enabled = 0;
+ while (sc->esc_rx_active)
+ pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx);
+}
+
+static void
+e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval)
+{
+ struct eth_uni *eu;
+ int idx;
+
+ idx = reg >> 1;
+ assert(idx < 15);
+
+ eu = &sc->esc_uni[idx];
+
+ if (reg & 0x1) {
+ /* RAH */
+ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV);
+ eu->eu_addrsel = (wval >> 16) & 0x3;
+ eu->eu_eth.octet[5] = wval >> 8;
+ eu->eu_eth.octet[4] = wval;
+ } else {
+ /* RAL */
+ eu->eu_eth.octet[3] = wval >> 24;
+ eu->eu_eth.octet[2] = wval >> 16;
+ eu->eu_eth.octet[1] = wval >> 8;
+ eu->eu_eth.octet[0] = wval;
+ }
+}
+
+static uint32_t
+e82545_read_ra(struct e82545_softc *sc, int reg)
+{
+ struct eth_uni *eu;
+ uint32_t retval;
+ int idx;
+
+ idx = reg >> 1;
+ assert(idx < 15);
+
+ eu = &sc->esc_uni[idx];
+
+ if (reg & 0x1) {
+ /* RAH */
+ retval = (eu->eu_valid << 31) |
+ (eu->eu_addrsel << 16) |
+ (eu->eu_eth.octet[5] << 8) |
+ eu->eu_eth.octet[4];
+ } else {
+ /* RAL */
+ retval = (eu->eu_eth.octet[3] << 24) |
+ (eu->eu_eth.octet[2] << 16) |
+ (eu->eu_eth.octet[1] << 8) |
+ eu->eu_eth.octet[0];
+ }
+
+ return (retval);
+}
+
+static void
+e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value)
+{
+ int ridx;
+
+ if (offset & 0x3) {
+ DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value);
+ return;
+ }
+ DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value);
+
+ switch (offset) {
+ case E1000_CTRL:
+ case E1000_CTRL_DUP:
+ e82545_devctl(sc, value);
+ break;
+ case E1000_FCAL:
+ sc->esc_FCAL = value;
+ break;
+ case E1000_FCAH:
+ sc->esc_FCAH = value & ~0xFFFF0000;
+ break;
+ case E1000_FCT:
+ sc->esc_FCT = value & ~0xFFFF0000;
+ break;
+ case E1000_VET:
+ sc->esc_VET = value & ~0xFFFF0000;
+ break;
+ case E1000_FCTTV:
+ sc->esc_FCTTV = value & ~0xFFFF0000;
+ break;
+ case E1000_LEDCTL:
+ sc->esc_LEDCTL = value & ~0x30303000;
+ break;
+ case E1000_PBA:
+ sc->esc_PBA = value & 0x0000FF80;
+ break;
+ case E1000_ICR:
+ case E1000_ITR:
+ case E1000_ICS:
+ case E1000_IMS:
+ case E1000_IMC:
+ e82545_intr_write(sc, offset, value);
+ break;
+ case E1000_RCTL:
+ e82545_rx_ctl(sc, value);
+ break;
+ case E1000_FCRTL:
+ sc->esc_FCRTL = value & ~0xFFFF0007;
+ break;
+ case E1000_FCRTH:
+ sc->esc_FCRTH = value & ~0xFFFF0007;
+ break;
+ case E1000_RDBAL(0):
+ sc->esc_RDBAL = value & ~0xF;
+ if (sc->esc_rx_enabled) {
+ /* Apparently legal: update cached address */
+ e82545_rx_update_rdba(sc);
+ }
+ break;
+ case E1000_RDBAH(0):
+ assert(!sc->esc_rx_enabled);
+ sc->esc_RDBAH = value;
+ break;
+ case E1000_RDLEN(0):
+ assert(!sc->esc_rx_enabled);
+ sc->esc_RDLEN = value & ~0xFFF0007F;
+ break;
+ case E1000_RDH(0):
+ /* XXX should only ever be zero ? Range check ? */
+ sc->esc_RDH = value;
+ break;
+ case E1000_RDT(0):
+ /* XXX if this opens up the rx ring, do something ? */
+ sc->esc_RDT = value;
+ break;
+ case E1000_RDTR:
+ /* ignore FPD bit 31 */
+ sc->esc_RDTR = value & ~0xFFFF0000;
+ break;
+ case E1000_RXDCTL(0):
+ sc->esc_RXDCTL = value & ~0xFEC0C0C0;
+ break;
+ case E1000_RADV:
+ sc->esc_RADV = value & ~0xFFFF0000;
+ break;
+ case E1000_RSRPD:
+ sc->esc_RSRPD = value & ~0xFFFFF000;
+ break;
+ case E1000_RXCSUM:
+ sc->esc_RXCSUM = value & ~0xFFFFF800;
+ break;
+ case E1000_TXCW:
+ sc->esc_TXCW = value & ~0x3FFF0000;
+ break;
+ case E1000_TCTL:
+ e82545_tx_ctl(sc, value);
+ break;
+ case E1000_TIPG:
+ sc->esc_TIPG = value;
+ break;
+ case E1000_AIT:
+ sc->esc_AIT = value;
+ break;
+ case E1000_TDBAL(0):
+ sc->esc_TDBAL = value & ~0xF;
+ if (sc->esc_tx_enabled) {
+ /* Apparently legal */
+ e82545_tx_update_tdba(sc);
+ }
+ break;
+ case E1000_TDBAH(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TDBAH = value;
+ break;
+ case E1000_TDLEN(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TDLEN = value & ~0xFFF0007F;
+ break;
+ case E1000_TDH(0):
+ //assert(!sc->esc_tx_enabled);
+ /* XXX should only ever be zero ? Range check ? */
+ sc->esc_TDHr = sc->esc_TDH = value;
+ break;
+ case E1000_TDT(0):
+ /* XXX range check ? */
+ sc->esc_TDT = value;
+ if (sc->esc_tx_enabled)
+ e82545_tx_start(sc);
+ break;
+ case E1000_TIDV:
+ sc->esc_TIDV = value & ~0xFFFF0000;
+ break;
+ case E1000_TXDCTL(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TXDCTL = value & ~0xC0C0C0;
+ break;
+ case E1000_TADV:
+ sc->esc_TADV = value & ~0xFFFF0000;
+ break;
+ case E1000_RAL(0) ... E1000_RAH(15):
+ /* convert to u32 offset */
+ ridx = (offset - E1000_RAL(0)) >> 2;
+ e82545_write_ra(sc, ridx, value);
+ break;
+ case E1000_MTA ... (E1000_MTA + (127*4)):
+ sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value;
+ break;
+ case E1000_VFTA ... (E1000_VFTA + (127*4)):
+ sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value;
+ break;
+ case E1000_EECD:
+ {
+ //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value);
+ /* edge triggered low->high */
+ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ?
+ 0 : (value & E1000_EECD_SK));
+ uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS|
+ E1000_EECD_DI|E1000_EECD_REQ);
+ sc->eeprom_control &= ~eecd_mask;
+ sc->eeprom_control |= (value & eecd_mask);
+ /* grant/revoke immediately */
+ if (value & E1000_EECD_REQ) {
+ sc->eeprom_control |= E1000_EECD_GNT;
+ } else {
+ sc->eeprom_control &= ~E1000_EECD_GNT;
+ }
+ if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) {
+ e82545_eecd_strobe(sc);
+ }
+ return;
+ }
+ case E1000_MDIC:
+ {
+ uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >>
+ E1000_MDIC_REG_SHIFT);
+ uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >>
+ E1000_MDIC_PHY_SHIFT);
+ sc->mdi_control =
+ (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST));
+ if ((value & E1000_MDIC_READY) != 0) {
+ DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value);
+ return;
+ }
+ switch (value & E82545_MDIC_OP_MASK) {
+ case E1000_MDIC_OP_READ:
+ sc->mdi_control &= ~E82545_MDIC_DATA_MASK;
+ sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr);
+ break;
+ case E1000_MDIC_OP_WRITE:
+ e82545_write_mdi(sc, reg_addr, phy_addr,
+ value & E82545_MDIC_DATA_MASK);
+ break;
+ default:
+ DPRINTF("Unknown MDIC op: 0x%x\r\n", value);
+ return;
+ }
+ /* TODO: barrier? */
+ sc->mdi_control |= E1000_MDIC_READY;
+ if (value & E82545_MDIC_IE) {
+ // TODO: generate interrupt
+ }
+ return;
+ }
+ case E1000_MANC:
+ case E1000_STATUS:
+ return;
+ default:
+ DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value);
+ return;
+ }
+}
+
+static uint32_t
+e82545_read_register(struct e82545_softc *sc, uint32_t offset)
+{
+ uint32_t retval;
+ int ridx;
+
+ if (offset & 0x3) {
+ DPRINTF("Unaligned register read offset:0x%x\r\n", offset);
+ return 0;
+ }
+
+ DPRINTF("Register read: 0x%x\r\n", offset);
+
+ switch (offset) {
+ case E1000_CTRL:
+ retval = sc->esc_CTRL;
+ break;
+ case E1000_STATUS:
+ retval = E1000_STATUS_FD | E1000_STATUS_LU |
+ E1000_STATUS_SPEED_1000;
+ break;
+ case E1000_FCAL:
+ retval = sc->esc_FCAL;
+ break;
+ case E1000_FCAH:
+ retval = sc->esc_FCAH;
+ break;
+ case E1000_FCT:
+ retval = sc->esc_FCT;
+ break;
+ case E1000_VET:
+ retval = sc->esc_VET;
+ break;
+ case E1000_FCTTV:
+ retval = sc->esc_FCTTV;
+ break;
+ case E1000_LEDCTL:
+ retval = sc->esc_LEDCTL;
+ break;
+ case E1000_PBA:
+ retval = sc->esc_PBA;
+ break;
+ case E1000_ICR:
+ case E1000_ITR:
+ case E1000_ICS:
+ case E1000_IMS:
+ case E1000_IMC:
+ retval = e82545_intr_read(sc, offset);
+ break;
+ case E1000_RCTL:
+ retval = sc->esc_RCTL;
+ break;
+ case E1000_FCRTL:
+ retval = sc->esc_FCRTL;
+ break;
+ case E1000_FCRTH:
+ retval = sc->esc_FCRTH;
+ break;
+ case E1000_RDBAL(0):
+ retval = sc->esc_RDBAL;
+ break;
+ case E1000_RDBAH(0):
+ retval = sc->esc_RDBAH;
+ break;
+ case E1000_RDLEN(0):
+ retval = sc->esc_RDLEN;
+ break;
+ case E1000_RDH(0):
+ retval = sc->esc_RDH;
+ break;
+ case E1000_RDT(0):
+ retval = sc->esc_RDT;
+ break;
+ case E1000_RDTR:
+ retval = sc->esc_RDTR;
+ break;
+ case E1000_RXDCTL(0):
+ retval = sc->esc_RXDCTL;
+ break;
+ case E1000_RADV:
+ retval = sc->esc_RADV;
+ break;
+ case E1000_RSRPD:
+ retval = sc->esc_RSRPD;
+ break;
+ case E1000_RXCSUM:
+ retval = sc->esc_RXCSUM;
+ break;
+ case E1000_TXCW:
+ retval = sc->esc_TXCW;
+ break;
+ case E1000_TCTL:
+ retval = sc->esc_TCTL;
+ break;
+ case E1000_TIPG:
+ retval = sc->esc_TIPG;
+ break;
+ case E1000_AIT:
+ retval = sc->esc_AIT;
+ break;
+ case E1000_TDBAL(0):
+ retval = sc->esc_TDBAL;
+ break;
+ case E1000_TDBAH(0):
+ retval = sc->esc_TDBAH;
+ break;
+ case E1000_TDLEN(0):
+ retval = sc->esc_TDLEN;
+ break;
+ case E1000_TDH(0):
+ retval = sc->esc_TDH;
+ break;
+ case E1000_TDT(0):
+ retval = sc->esc_TDT;
+ break;
+ case E1000_TIDV:
+ retval = sc->esc_TIDV;
+ break;
+ case E1000_TXDCTL(0):
+ retval = sc->esc_TXDCTL;
+ break;
+ case E1000_TADV:
+ retval = sc->esc_TADV;
+ break;
+ case E1000_RAL(0) ... E1000_RAH(15):
+ /* convert to u32 offset */
+ ridx = (offset - E1000_RAL(0)) >> 2;
+ retval = e82545_read_ra(sc, ridx);
+ break;
+ case E1000_MTA ... (E1000_MTA + (127*4)):
+ retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2];
+ break;
+ case E1000_VFTA ... (E1000_VFTA + (127*4)):
+ retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2];
+ break;
+ case E1000_EECD:
+ //DPRINTF("EECD read %x\r\n", sc->eeprom_control);
+ retval = sc->eeprom_control;
+ break;
+ case E1000_MDIC:
+ retval = sc->mdi_control;
+ break;
+ case E1000_MANC:
+ retval = 0;
+ break;
+ /* stats that we emulate. */
+ case E1000_MPC:
+ retval = sc->missed_pkt_count;
+ break;
+ case E1000_PRC64:
+ retval = sc->pkt_rx_by_size[0];
+ break;
+ case E1000_PRC127:
+ retval = sc->pkt_rx_by_size[1];
+ break;
+ case E1000_PRC255:
+ retval = sc->pkt_rx_by_size[2];
+ break;
+ case E1000_PRC511:
+ retval = sc->pkt_rx_by_size[3];
+ break;
+ case E1000_PRC1023:
+ retval = sc->pkt_rx_by_size[4];
+ break;
+ case E1000_PRC1522:
+ retval = sc->pkt_rx_by_size[5];
+ break;
+ case E1000_GPRC:
+ retval = sc->good_pkt_rx_count;
+ break;
+ case E1000_BPRC:
+ retval = sc->bcast_pkt_rx_count;
+ break;
+ case E1000_MPRC:
+ retval = sc->mcast_pkt_rx_count;
+ break;
+ case E1000_GPTC:
+ case E1000_TPT:
+ retval = sc->good_pkt_tx_count;
+ break;
+ case E1000_GORCL:
+ retval = (uint32_t)sc->good_octets_rx;
+ break;
+ case E1000_GORCH:
+ retval = (uint32_t)(sc->good_octets_rx >> 32);
+ break;
+ case E1000_TOTL:
+ case E1000_GOTCL:
+ retval = (uint32_t)sc->good_octets_tx;
+ break;
+ case E1000_TOTH:
+ case E1000_GOTCH:
+ retval = (uint32_t)(sc->good_octets_tx >> 32);
+ break;
+ case E1000_ROC:
+ retval = sc->oversize_rx_count;
+ break;
+ case E1000_TORL:
+ retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets);
+ break;
+ case E1000_TORH:
+ retval = (uint32_t)((sc->good_octets_rx +
+ sc->missed_octets) >> 32);
+ break;
+ case E1000_TPR:
+ retval = sc->good_pkt_rx_count + sc->missed_pkt_count +
+ sc->oversize_rx_count;
+ break;
+ case E1000_PTC64:
+ retval = sc->pkt_tx_by_size[0];
+ break;
+ case E1000_PTC127:
+ retval = sc->pkt_tx_by_size[1];
+ break;
+ case E1000_PTC255:
+ retval = sc->pkt_tx_by_size[2];
+ break;
+ case E1000_PTC511:
+ retval = sc->pkt_tx_by_size[3];
+ break;
+ case E1000_PTC1023:
+ retval = sc->pkt_tx_by_size[4];
+ break;
+ case E1000_PTC1522:
+ retval = sc->pkt_tx_by_size[5];
+ break;
+ case E1000_MPTC:
+ retval = sc->mcast_pkt_tx_count;
+ break;
+ case E1000_BPTC:
+ retval = sc->bcast_pkt_tx_count;
+ break;
+ case E1000_TSCTC:
+ retval = sc->tso_tx_count;
+ break;
+ /* stats that are always 0. */
+ case E1000_CRCERRS:
+ case E1000_ALGNERRC:
+ case E1000_SYMERRS:
+ case E1000_RXERRC:
+ case E1000_SCC:
+ case E1000_ECOL:
+ case E1000_MCC:
+ case E1000_LATECOL:
+ case E1000_COLC:
+ case E1000_DC:
+ case E1000_TNCRS:
+ case E1000_SEC:
+ case E1000_CEXTERR:
+ case E1000_RLEC:
+ case E1000_XONRXC:
+ case E1000_XONTXC:
+ case E1000_XOFFRXC:
+ case E1000_XOFFTXC:
+ case E1000_FCRUC:
+ case E1000_RNBC:
+ case E1000_RUC:
+ case E1000_RFC:
+ case E1000_RJC:
+ case E1000_MGTPRC:
+ case E1000_MGTPDC:
+ case E1000_MGTPTC:
+ case E1000_TSCTFC:
+ retval = 0;
+ break;
+ default:
+ DPRINTF("Unknown read register: 0x%x\r\n", offset);
+ retval = 0;
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ struct e82545_softc *sc;
+
+ //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size);
+
+ sc = pi->pi_arg;
+
+ pthread_mutex_lock(&sc->esc_mtx);
+
+ switch (baridx) {
+ case E82545_BAR_IO:
+ switch (offset) {
+ case E82545_IOADDR:
+ if (size != 4) {
+ DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value);
+ } else
+ sc->io_addr = (uint32_t)value;
+ break;
+ case E82545_IODATA:
+ if (size != 4) {
+ DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value);
+ } else if (sc->io_addr > E82545_IO_REGISTER_MAX) {
+ DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value);
+ } else
+ e82545_write_register(sc, sc->io_addr,
+ (uint32_t)value);
+ break;
+ default:
+ DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size);
+ break;
+ }
+ break;
+ case E82545_BAR_REGISTER:
+ if (size != 4) {
+ DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value);
+ } else
+ e82545_write_register(sc, (uint32_t)offset,
+ (uint32_t)value);
+ break;
+ default:
+ DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n",
+ baridx, offset, value, size);
+ }
+
+ pthread_mutex_unlock(&sc->esc_mtx);
+}
+
+static uint64_t
+e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct e82545_softc *sc;
+ uint64_t retval;
+
+ //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size);
+ sc = pi->pi_arg;
+ retval = 0;
+
+ pthread_mutex_lock(&sc->esc_mtx);
+
+ switch (baridx) {
+ case E82545_BAR_IO:
+ switch (offset) {
+ case E82545_IOADDR:
+ if (size != 4) {
+ DPRINTF("Wrong io addr read sz:%d\r\n", size);
+ } else
+ retval = sc->io_addr;
+ break;
+ case E82545_IODATA:
+ if (size != 4) {
+ DPRINTF("Wrong io data read sz:%d\r\n", size);
+ }
+ if (sc->io_addr > E82545_IO_REGISTER_MAX) {
+ DPRINTF("Non-register io read addr:0x%x\r\n",
+ sc->io_addr);
+ } else
+ retval = e82545_read_register(sc, sc->io_addr);
+ break;
+ default:
+ DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n",
+ offset, size);
+ break;
+ }
+ break;
+ case E82545_BAR_REGISTER:
+ if (size != 4) {
+ DPRINTF("Wrong register read size:%d offset:0x%lx\r\n",
+ size, offset);
+ } else
+ retval = e82545_read_register(sc, (uint32_t)offset);
+ break;
+ default:
+ DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n",
+ baridx, offset, size);
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->esc_mtx);
+
+ return (retval);
+}
+
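+/*
+ * Reset device state.  A non-zero 'drvr' indicates a driver (software)
+ * initiated reset via CTRL.RST, which leaves the state guarded by the
+ * !drvr checks below intact; drvr == 0 models a power-on reset.
+ */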
+static void
+e82545_reset(struct e82545_softc *sc, int drvr)
+{
+ int i;
+
+ e82545_rx_disable(sc);
+ e82545_tx_disable(sc);
+
+ /* clear outstanding interrupts */
+ if (sc->esc_irq_asserted)
+ pci_lintr_deassert(sc->esc_pi);
+
+ /* misc */
+ if (!drvr) {
+ sc->esc_FCAL = 0;
+ sc->esc_FCAH = 0;
+ sc->esc_FCT = 0;
+ sc->esc_VET = 0;
+ sc->esc_FCTTV = 0;
+ }
+ sc->esc_LEDCTL = 0x07061302;
+ sc->esc_PBA = 0x00100030;
+
+ /* start nvm in opcode mode. */
+ sc->nvm_opaddr = 0;
+ sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+ sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+ sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN;
+ e82545_init_eeprom(sc);
+
+ /* interrupt */
+ sc->esc_ICR = 0;
+ sc->esc_ITR = 250;
+ sc->esc_ICS = 0;
+ sc->esc_IMS = 0;
+ sc->esc_IMC = 0;
+
+ /* L2 filters */
+ if (!drvr) {
+ memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan));
+ memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast));
+ memset(sc->esc_uni, 0, sizeof(sc->esc_uni));
+
+ /* XXX not necessary on 82545 ?? */
+ sc->esc_uni[0].eu_valid = 1;
+ memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet,
+ ETHER_ADDR_LEN);
+ } else {
+ /* Clear RAH valid bits */
+ for (i = 0; i < 16; i++)
+ sc->esc_uni[i].eu_valid = 0;
+ }
+
+ /* receive */
+ if (!drvr) {
+ sc->esc_RDBAL = 0;
+ sc->esc_RDBAH = 0;
+ }
+ sc->esc_RCTL = 0;
+ sc->esc_FCRTL = 0;
+ sc->esc_FCRTH = 0;
+ sc->esc_RDLEN = 0;
+ sc->esc_RDH = 0;
+ sc->esc_RDT = 0;
+ sc->esc_RDTR = 0;
+ sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */
+ sc->esc_RADV = 0;
+ sc->esc_RXCSUM = 0;
+
+ /* transmit */
+ if (!drvr) {
+ sc->esc_TDBAL = 0;
+ sc->esc_TDBAH = 0;
+ sc->esc_TIPG = 0;
+ sc->esc_AIT = 0;
+ sc->esc_TIDV = 0;
+ sc->esc_TADV = 0;
+ }
+ sc->esc_tdba = 0;
+ sc->esc_txdesc = NULL;
+ sc->esc_TXCW = 0;
+ sc->esc_TCTL = 0;
+ sc->esc_TDLEN = 0;
+ sc->esc_TDT = 0;
+ sc->esc_TDHr = sc->esc_TDH = 0;
+ sc->esc_TXDCTL = 0;
+}
+
+static void
+e82545_open_tap(struct e82545_softc *sc, char *opts)
+{
+ char tbuf[80];
+
+ if (opts == NULL) {
+ sc->esc_tapfd = -1;
+ return;
+ }
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, opts, sizeof(tbuf));
+
+ sc->esc_tapfd = open(tbuf, O_RDWR);
+ if (sc->esc_tapfd == -1) {
+ DPRINTF("unable to open tap device %s\n", opts);
+ exit(1);
+ }
+
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) {
+ WPRINTF("tap device O_NONBLOCK failed: %d\n", errno);
+ close(sc->esc_tapfd);
+		sc->esc_tapfd = -1;
+		/* No usable descriptor to register with the event loop. */
+		return;
+	}
+
+ sc->esc_mevp = mevent_add(sc->esc_tapfd,
+ EVF_READ,
+ e82545_tap_callback,
+ sc);
+ if (sc->esc_mevp == NULL) {
+ DPRINTF("Could not register mevent %d\n", EVF_READ);
+ close(sc->esc_tapfd);
+ sc->esc_tapfd = -1;
+ }
+}
+
+static int
+e82545_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+ struct ether_addr *ea;
+ char *tmpstr;
+ char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+
+ tmpstr = strsep(&mac_str,"=");
+ if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
+ ea = ether_aton(mac_str);
+ if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
+ memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
+ fprintf(stderr, "Invalid MAC %s\n", mac_str);
+ return (1);
+ } else
+ memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
+ }
+ return (0);
+}
+
+static int
+e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ DPRINTF("Loading with options: %s\r\n", opts);
+
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+ char nstr[80];
+ struct e82545_softc *sc;
+ char *devname;
+ char *vtopts;
+ int mac_provided;
+
+ /* Setup our softc */
+	sc = calloc(1, sizeof(*sc));
+
+ pi->pi_arg = sc;
+ sc->esc_pi = pi;
+ sc->esc_ctx = ctx;
+
+ pthread_mutex_init(&sc->esc_mtx, NULL);
+ pthread_cond_init(&sc->esc_rx_cond, NULL);
+ pthread_cond_init(&sc->esc_tx_cond, NULL);
+ pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc);
+ snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot,
+ pi->pi_func);
+ pthread_set_name_np(sc->esc_tx_tid, nstr);
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID);
+ pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL);
+
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+ pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1);
+
+ /* TODO: this card also supports msi, but the freebsd driver for it
+ * does not, so I have not implemented it. */
+ pci_lintr_request(pi);
+
+ pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32,
+ E82545_BAR_REGISTER_LEN);
+ pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32,
+ E82545_BAR_FLASH_LEN);
+ pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO,
+ E82545_BAR_IO_LEN);
+
+ /*
+ * Attempt to open the tap device and read the MAC address
+ * if specified. Copied from virtio-net, slightly modified.
+ */
+ mac_provided = 0;
+ sc->esc_tapfd = -1;
+ if (opts != NULL) {
+ int err;
+
+ devname = vtopts = strdup(opts);
+ (void) strsep(&vtopts, ",");
+
+ if (vtopts != NULL) {
+ err = e82545_parsemac(vtopts, sc->esc_mac.octet);
+ if (err != 0) {
+ free(devname);
+ return (err);
+ }
+ mac_provided = 1;
+ }
+
+ if (strncmp(devname, "tap", 3) == 0 ||
+ strncmp(devname, "vmnet", 5) == 0)
+ e82545_open_tap(sc, devname);
+
+ free(devname);
+ }
+
+ /*
+	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the PCI slot/func number and the VM name.
+ */
+ if (!mac_provided) {
+ snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+ pi->pi_func, vmname);
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->esc_mac.octet[0] = 0x00;
+ sc->esc_mac.octet[1] = 0xa0;
+ sc->esc_mac.octet[2] = 0x98;
+ sc->esc_mac.octet[3] = digest[0];
+ sc->esc_mac.octet[4] = digest[1];
+ sc->esc_mac.octet[5] = digest[2];
+ }
+
+ /* H/w initiated reset */
+ e82545_reset(sc, 0);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_e82545 = {
+ .pe_emu = "e1000",
+ .pe_init = e82545_init,
+ .pe_barwrite = e82545_write,
+ .pe_barread = e82545_read
+};
+PCI_EMUL_SET(pci_de_e82545);
+
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index d74950b..0fffb19 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -230,7 +230,7 @@ int pci_msi_enabled(struct pci_devinst *pi);
int pci_msix_enabled(struct pci_devinst *pi);
int pci_msix_table_bar(struct pci_devinst *pi);
int pci_msix_pba_bar(struct pci_devinst *pi);
-int pci_msi_msgnum(struct pci_devinst *pi);
+int pci_msi_maxmsgnum(struct pci_devinst *pi);
int pci_parse_slot(char *opt);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);