field | value | date
---|---|---
author | Renato Botelho <renato@netgate.com> | 2016-08-25 10:41:37 -0300
committer | Renato Botelho <renato@netgate.com> | 2016-08-25 10:41:37 -0300
commit | 29ebd1247162a77db08e5e2e00d033220ec807fe |
tree | d45bd4c2da327a132f18b6f39db36fe188c4e029 /sys |
parent | 75cd8d40056c799f03b759475d9bfd10ba266a6c |
parent | c29dc2b4296960868edafe94ebf975be284200bb |
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys')
57 files changed, 4698 insertions, 4941 deletions
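Two of the CAM hunks below (sys/cam/cam_ccb.h and sys/cam/ctl/scsi_ctl.c) introduce a packed encoding for the notify-acknowledge `arg` field: the low byte carries one of the new CAM_RSP_TMF_* response codes, and the upper three bytes carry code-specific additional response information taken from `io->taskio.task_resp`. A minimal sketch of that layout, using the constants from the hunk; the `tmf_arg_*` helpers are illustrative only and not part of the change:

```c
#include <stdint.h>

/* Response codes as added to cam_ccb.h below (subset of SPL-4/FCP-4). */
#define CAM_RSP_TMF_COMPLETE		0x00
#define CAM_RSP_TMF_REJECTED		0x04
#define CAM_RSP_TMF_FAILED		0x05
#define CAM_RSP_TMF_SUCCEEDED		0x08
#define CAM_RSP_TMF_INCORRECT_LUN	0x09

/*
 * Pack a response: the low byte is the RESPONSE CODE, the upper three
 * bytes hold ADDITIONAL RESPONSE INFORMATION.  This mirrors
 * "ccb->cna2.arg |= scsi_3btoul(io->taskio.task_resp) << 8" in the
 * scsi_ctl.c hunk; tmf_arg_pack() itself is a hypothetical helper.
 */
static inline uint32_t
tmf_arg_pack(uint8_t code, uint32_t info)
{
	return ((uint32_t)code | (info << 8));
}

static inline uint8_t
tmf_arg_code(uint32_t arg)		/* extract the response code */
{
	return (arg & 0xff);
}

static inline uint32_t
tmf_arg_info(uint32_t arg)		/* extract the additional info */
{
	return (arg >> 8);
}
```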
diff --git a/sys/boot/efi/loader/Makefile b/sys/boot/efi/loader/Makefile index bc38ea6..2c430ec 100644 --- a/sys/boot/efi/loader/Makefile +++ b/sys/boot/efi/loader/Makefile @@ -50,6 +50,18 @@ CFLAGS+= -DEFI_ZFS_BOOT .endif CFLAGS+= -DNO_PCI -DEFI +.if !defined(BOOT_HIDE_SERIAL_NUMBERS) +# Export serial numbers, UUID, and asset tag from loader. +CFLAGS+= -DSMBIOS_SERIAL_NUMBERS +.if defined(BOOT_LITTLE_ENDIAN_UUID) +# Use little-endian UUID format as defined in SMBIOS 2.6. +CFLAGS+= -DSMBIOS_LITTLE_ENDIAN_UUID +.elif defined(BOOT_NETWORK_ENDIAN_UUID) +# Use network-endian UUID format for backward compatibility. +CFLAGS+= -DSMBIOS_NETWORK_ENDIAN_UUID +.endif +.endif + .if ${MK_FORTH} != "no" BOOT_FORTH= yes CFLAGS+= -DBOOT_FORTH diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h index 251d62d..1d56ac7 100644 --- a/sys/cam/cam_ccb.h +++ b/sys/cam/cam_ccb.h @@ -1084,7 +1084,17 @@ struct ccb_notify_acknowledge { u_int tag_id; /* Tag for immediate notify */ u_int seq_id; /* Tar for target of notify */ u_int initiator_id; /* Initiator Identifier */ - u_int arg; /* Function specific */ + u_int arg; /* Response information */ + /* + * Lower byte of arg is one of RESPONSE CODE values defined below + * (subset of response codes from SPL-4 and FCP-4 specifications), + * upper 3 bytes is code-specific ADDITIONAL RESPONSE INFORMATION. + */ +#define CAM_RSP_TMF_COMPLETE 0x00 +#define CAM_RSP_TMF_REJECTED 0x04 +#define CAM_RSP_TMF_FAILED 0x05 +#define CAM_RSP_TMF_SUCCEEDED 0x08 +#define CAM_RSP_TMF_INCORRECT_LUN 0x09 }; /* HBA engine structures. */ diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c index 5455eea..8ec048b 100644 --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -1818,6 +1818,7 @@ ctl_init(void) NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); softc->flags = 0; + TUNABLE_INT_FETCH("kern.cam.ctl.ha_mode", (int *)&softc->ha_mode); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0, "HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)"); @@ -1827,6 +1828,7 @@ ctl_init(void) * figured out through the slot the controller is in. Although it * is an active/active system, someone has to be in charge. */ + TUNABLE_INT_FETCH("kern.cam.ctl.ha_id", &softc->ha_id); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0, "HA head ID (0 - no HA)"); diff --git a/sys/cam/ctl/scsi_ctl.c b/sys/cam/ctl/scsi_ctl.c index 2705240..abdbdcd 100644 --- a/sys/cam/ctl/scsi_ctl.c +++ b/sys/cam/ctl/scsi_ctl.c @@ -1552,6 +1552,7 @@ ctlfedone(struct cam_periph *periph, union ccb *done_ccb) /* * Queue this back down to the SIM as an immediate notify. 
*/ + done_ccb->ccb_h.status = CAM_REQ_INPROG; done_ccb->ccb_h.func_code = XPT_IMMEDIATE_NOTIFY; xpt_action(done_ccb); break; @@ -2040,6 +2041,28 @@ ctlfe_done(union ctl_io *io) */ ccb->ccb_h.status = CAM_REQ_INPROG; ccb->ccb_h.func_code = XPT_NOTIFY_ACKNOWLEDGE; + switch (io->taskio.task_status) { + case CTL_TASK_FUNCTION_COMPLETE: + ccb->cna2.arg = CAM_RSP_TMF_COMPLETE; + break; + case CTL_TASK_FUNCTION_SUCCEEDED: + ccb->cna2.arg = CAM_RSP_TMF_SUCCEEDED; + ccb->ccb_h.flags |= CAM_SEND_STATUS; + break; + case CTL_TASK_FUNCTION_REJECTED: + ccb->cna2.arg = CAM_RSP_TMF_REJECTED; + ccb->ccb_h.flags |= CAM_SEND_STATUS; + break; + case CTL_TASK_LUN_DOES_NOT_EXIST: + ccb->cna2.arg = CAM_RSP_TMF_INCORRECT_LUN; + ccb->ccb_h.flags |= CAM_SEND_STATUS; + break; + case CTL_TASK_FUNCTION_NOT_SUPPORTED: + ccb->cna2.arg = CAM_RSP_TMF_FAILED; + ccb->ccb_h.flags |= CAM_SEND_STATUS; + break; + } + ccb->cna2.arg |= scsi_3btoul(io->taskio.task_resp) << 8; xpt_action(ccb); } else if (io->io_hdr.flags & CTL_FLAG_STATUS_SENT) { if (softc->flags & CTLFE_LUN_WILDCARD) { diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h index 4e5b1c9..019efdf 100644 --- a/sys/cddl/compat/opensolaris/sys/vnode.h +++ b/sys/cddl/compat/opensolaris/sys/vnode.h @@ -87,8 +87,6 @@ vn_is_readonly(vnode_t *vp) #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) -#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0) - #define vnevent_create(vp, ct) do { } while (0) #define vnevent_link(vp, ct) do { } while (0) #define vnevent_remove(vp, dvp, name, ct) do { } while (0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h index 349f8ef..22d8e60 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -48,18 +48,18 @@ extern "C" { #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, - int, int *, pathname_t *); -extern void zfs_dirent_unlock(zfs_dirlock_t *); -extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); -extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, boolean_t *); -extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, - pathname_t *); +#if 0 +extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); +#else +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +#endif extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, uint_t, znode_t **, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); -extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index 4120883..df5ce05 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -75,6 +75,7 @@ struct zfsvfs { boolean_t 
z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_use_namecache;/* make use of FreeBSD name cache */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index 3e72ec4..7649295 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -181,10 +181,12 @@ typedef struct znode { struct zfsvfs *z_zfsvfs; vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ +#ifdef illumos kmutex_t z_lock; /* znode modification lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ +#endif kmutex_t z_range_lock; /* protects changes to z_range_avl */ avl_tree_t z_range_avl; /* avl tree of file range locks */ uint8_t z_unlinked; /* file has been unlinked */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index fd1d59b..2e94ccc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -1058,8 +1058,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, * create a new acl and leave any cached acl in place. */ static int -zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, - boolean_t will_modify) +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize; @@ -1068,26 +1067,15 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, zfs_acl_phys_t znode_acl; int version; int error; - boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } - /* - * close race where znode could be upgrade while trying to - * read the znode attributes. 
- * - * But this could only happen if the file isn't already an SA - * znode - */ - if (!zp->z_is_sa && !have_lock) { - mutex_enter(&zp->z_lock); - drop_lock = B_TRUE; - } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, @@ -1133,8 +1121,6 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, if (!will_modify) zp->z_acl_cached = aclp; done: - if (drop_lock) - mutex_exit(&zp->z_lock); return (error); } @@ -1161,10 +1147,10 @@ zfs_acl_chown_setattr(znode_t *zp) int error; zfs_acl_t *aclp; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); return (error); @@ -1445,18 +1431,17 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) int error = 0; mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else - error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(ZTOV(zp)->v_type, mode, (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); @@ -1627,6 +1612,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, boolean_t need_chmod = B_TRUE; boolean_t inherited = B_FALSE; + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1710,12 +1696,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); - mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, - &paclp, B_FALSE)); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); inherited = B_TRUE; @@ -1724,7 +1708,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } - mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_acl_lock); if (need_chmod) { acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ? 
@@ -1790,7 +1773,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); @@ -1938,6 +1922,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) boolean_t fuid_dirtied; uint64_t acl_obj; + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (mask == 0) return (SET_ERROR(ENOSYS)); @@ -1962,7 +1947,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } top: mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); @@ -1994,7 +1978,6 @@ top: zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); if (error == ERESTART) { @@ -2020,7 +2003,6 @@ top: if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); - mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); @@ -2124,7 +2106,8 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index cf42ff6..f8f695b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -58,96 +58,64 @@ #include <sys/extdirent.h> /* - * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + boolean_t exact, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { - matchtype_t mt = MT_FIRST; - boolean_t conflict = B_FALSE; - size_t bufsz = 0; - char *buf = NULL; - - if (rpnp) { - buf = rpnp->pn_buf; - bufsz = rpnp->pn_bufsize; - } - if (exact) - mt = MT_EXACT; + matchtype_t mt = exact? MT_EXACT : MT_FIRST; + /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, buf, bufsz, &conflict); - if (!error && deflags) - *deflags = conflict ? ED_CASE_CONFLICT : 0; + zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); - if (error == ENOENT && update) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); - return (error); } /* - * Lock a directory entry. A dirlock on <dzp, name> protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. + * Look up a directory entry under a locked vnode. 
+ * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory - * ZCILOOK: On a mixed sensitivity file system, - * this lookup should be case-insensitive. - * ZCIEXACT: On a purely case-insensitive file system, - * this lookup should be case-sensitive. - * ZRENAMING: we are locking for renaming, force narrow locks - * ZHAVELOCK: Don't grab the z_name_lock for this call. The - * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * direntflags - (case-insensitive lookup only) - * flags if multiple case-sensitive matches exist in directory - * realpnp - (case-insensitive lookup only) - * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. - * NOTE: For case-insensitive file systems we take wide locks (see below), - * but return znode pointers to a single match. */ int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag, int *direntflags, pathname_t *realpnp) +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t *dl; - boolean_t update; boolean_t exact; uint64_t zoid; vnode_t *vp = NULL; int error = 0; - int cmpflags; + + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; - *dlpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' @@ -161,280 +129,93 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect what vnodes can be cached in the DNLC, how we - * perform zap lookups, and the "width" of our dirlocks. + * affect how we perform zap lookups. * - * A normal dirlock locks a single name. Note that with - * normalization a name can be composed multiple ways, but - * when normalized, these names all compare equal. A wide - * dirlock locks multiple names. We need these when the file - * system is supporting mixed-mode access. It is sometimes - * necessary to lock all case permutations of file name at - * once so that simultaneous case-insensitive/case-sensitive - * behaves as rationally as possible. - */ - - /* * Decide if exact matches should be requested when performing * a zap lookup on file systems supporting case-insensitive * access. - */ - exact = - ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. * - * Maybe can add TO-UPPERed version of name to dnlc in ci-only - * case for performance improvement? 
- */ - update = !zfsvfs->z_norm || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); - - /* - * ZRENAMING indicates we are in a situation where we should - * take narrow locks regardless of the file system's - * preferences for normalizing and case folding. This will - * prevent us deadlocking trying to grab the same wide lock - * twice if the two names happen to be case-insensitive - * matches. - */ - if (flag & ZRENAMING) - cmpflags = 0; - else - cmpflags = zfsvfs->z_norm; - - /* - * Wait until there are no locks on this name. - * - * Don't grab the the lock if it is already held. However, cannot - * have both ZSHARED and ZHAVELOCK together. - */ - ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); - if (!(flag & ZHAVELOCK)) - rw_enter(&dzp->z_name_lock, RW_READER); - - mutex_enter(&dzp->z_lock); - for (;;) { - if (dzp->z_unlinked && !(flag & ZXATTR)) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { - if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, - U8_UNICODE_LATEST, &error) == 0) || error != 0) - break; - } - if (error != 0) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - if (dl == NULL) { - size_t namesize; - - /* - * Allocate a new dirlock and add it to the list. - */ - namesize = strlen(name) + 1; - dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize, - KM_SLEEP); - cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); - dl->dl_name = (char *)(dl + 1); - bcopy(name, dl->dl_name, namesize); - dl->dl_sharecnt = 0; - dl->dl_namelock = 0; - dl->dl_namesize = namesize; - dl->dl_dzp = dzp; - dl->dl_next = dzp->z_dirlocks; - dzp->z_dirlocks = dl; - break; - } - if ((flag & ZSHARED) && dl->dl_sharecnt != 0) - break; - cv_wait(&dl->dl_cv, &dzp->z_lock); - } - - /* - * If the z_name_lock was NOT held for this dirlock record it. + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. */ - if (flag & ZHAVELOCK) - dl->dl_namelock = 1; + exact = zfsvfs->z_case == ZFS_CASE_MIXED; - if (flag & ZSHARED) - dl->dl_sharecnt++; - - mutex_exit(&dzp->z_lock); - - /* - * We have a dirlock on the name. (Note that it is the dirlock, - * not the dzp's z_lock, that protects the name in the zap object.) - * See if there's an object by this name; if so, put a hold on it. - */ + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? 
ENOENT : 0); } else { - if (update) - vp = dnlc_lookup(ZTOV(dzp), name); - if (vp == DNLC_NO_VNODE) { - VN_RELE(vp); - error = SET_ERROR(ENOENT); - } else if (vp) { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - return (SET_ERROR(EEXIST)); - } - *dlpp = dl; - *zpp = VTOZ(vp); - return (0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, exact, - update, direntflags, realpnp, &zoid); - } + error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { - zfs_dirent_unlock(dl); return (error); } } else { if (flag & ZNEW) { - zfs_dirent_unlock(dl); return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); - if (error) { - zfs_dirent_unlock(dl); + if (error) return (error); - } - if (!(flag & ZXATTR) && update) - dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + ASSERT(!(*zpp)->z_unlinked); } - *dlpp = dl; - return (0); } -/* - * Unlock this directory entry and wake anyone who was waiting for it. - */ -void -zfs_dirent_unlock(zfs_dirlock_t *dl) +static int +zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { - znode_t *dzp = dl->dl_dzp; - zfs_dirlock_t **prev_dl, *cur_dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + uint64_t parent; + int error; - mutex_enter(&dzp->z_lock); + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - if (!dl->dl_namelock) - rw_exit(&dzp->z_name_lock); + if (dzp->z_unlinked) + return (ENOENT); - if (dl->dl_sharecnt > 1) { - dl->dl_sharecnt--; - mutex_exit(&dzp->z_lock); - return; - } - prev_dl = &dzp->z_dirlocks; - while ((cur_dl = *prev_dl) != dl) - prev_dl = &cur_dl->dl_next; - *prev_dl = dl->dl_next; - cv_broadcast(&dl->dl_cv); - mutex_exit(&dzp->z_lock); - - cv_destroy(&dl->dl_cv); - kmem_free(dl, sizeof (*dl) + dl->dl_namesize); + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + return (error); } -/* - * Look up an entry in a directory. - * - * NOTE: '.' and '..' are handled as special cases because - * no directory entries are actually stored for them. If this is - * the root of a filesystem, then '.zfs' is also treated as a - * special pseudo-directory. - */ int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, - int *deflg, pathname_t *rpnp) +zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { - zfs_dirlock_t *dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; int error = 0; - uint64_t parent; - int unlinked; - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - mutex_enter(&dzp->z_lock); - unlinked = dzp->z_unlinked; - mutex_exit(&dzp->z_lock); - if (unlinked) - return (ENOENT); - - *vpp = ZTOV(dzp); - VN_HOLD(*vpp); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. 
- */ - if ((error = sa_lookup(dzp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - return (error); - if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { - error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred, - NULL, NULL, NULL); - return (error); - } + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - mutex_enter(&dzp->z_lock); - unlinked = dzp->z_unlinked; - mutex_exit(&dzp->z_lock); - if (unlinked) - return (ENOENT); + if (dzp->z_unlinked) + return (SET_ERROR(ENOENT)); - rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, parent, &zp); - if (error == 0) - *vpp = ZTOV(zp); - rw_exit(&dzp->z_parent_lock); - } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *vpp = zfsctl_root(dzp); + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + error = zfs_dd_lookup(dzp, zpp); } else { - int zf; - - zf = ZEXISTS | ZSHARED; - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { - *vpp = ZTOV(zp); - zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + *zpp = zp; } - rpnp = NULL; } - - if ((flags & FIGNORECASE) && rpnp && !error) - (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - return (error); } @@ -510,8 +291,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) if (error != 0) continue; + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); zp->z_unlinked = B_TRUE; - VN_RELE(ZTOV(zp)); + vput(ZTOV(zp)); } zap_cursor_fini(&zc); } @@ -535,7 +317,6 @@ zfs_purgedir(znode_t *dzp) znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t dl; int skipped = 0; int error; @@ -549,6 +330,7 @@ zfs_purgedir(znode_t *dzp) continue; } + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); @@ -563,20 +345,17 @@ zfs_purgedir(znode_t *dzp) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); skipped += 1; continue; } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) @@ -596,6 +375,7 @@ zfs_rmnode(znode_t *zp) int error; ASSERT(zp->z_links == 0); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. 
@@ -634,7 +414,8 @@ zfs_rmnode(znode_t *zp) &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT(error == 0); + ASSERT3S(error, ==, 0); + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); } acl_obj = zfs_external_acl(zp); @@ -668,12 +449,10 @@ zfs_rmnode(znode_t *zp) if (xzp) { ASSERT(error == 0); - mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_links = 0; /* no more links to it */ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx)); - mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } @@ -686,7 +465,7 @@ zfs_rmnode(znode_t *zp) dmu_tx_commit(tx); out: if (xzp) - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } static uint64_t @@ -700,12 +479,12 @@ zfs_dirent(znode_t *zp, uint64_t mode) } /* - * Link zp into dl. Can only fail if zp has been unlinked. + * Link zp into dzp. Can only fail if zp has been unlinked. */ int -zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) { - znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; @@ -715,18 +494,32 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) int count = 0; int error; - mutex_enter(&zp->z_lock); - + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); +#if 0 + if (zp_is_dir) { + error = 0; + if (dzp->z_links >= LINK_MAX) + error = SET_ERROR(EMLINK); + return (error); + } +#endif if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); - mutex_exit(&zp->z_lock); return (SET_ERROR(ENOENT)); } +#if 0 + if (zp->z_links >= LINK_MAX) { + return (SET_ERROR(EMLINK)); + } +#endif zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); + } else { + ASSERT(zp->z_unlinked == 0); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); @@ -740,11 +533,8 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) ctime, B_TRUE); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - - mutex_exit(&zp->z_lock); + ASSERT0(error); - mutex_enter(&dzp->z_lock); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; @@ -760,55 +550,48 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - mutex_exit(&dzp->z_lock); + ASSERT0(error); value = zfs_dirent(zp, zp->z_mode); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); - ASSERT(error == 0); - - dnlc_update(ZTOV(dzp), dl->dl_name, vp); + VERIFY0(error); return (0); } static int -zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { - if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && - (flag & ZCIEXACT)) || - ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && - !(flag & ZCILOOK))) + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_EXACT, tx); + dzp->z_id, name, MT_EXACT, tx); else 
error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_FIRST, tx); + dzp->z_id, name, MT_FIRST, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, tx); + dzp->z_id, name, tx); } return (error); } /* - * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int -zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) { - znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); @@ -818,22 +601,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, int count = 0; int error; - dnlc_remove(ZTOV(dzp), dl->dl_name); + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (!(flag & ZRENAMING)) { - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ - return (SET_ERROR(EBUSY)); - - if (vn_ismntpt(vp)) { /* don't remove mount point */ - vn_vfsunlock(vp); - return (SET_ERROR(EBUSY)); - } - - mutex_enter(&zp->z_lock); if (zp_is_dir && !zfs_dirempty(zp)) { - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); #ifdef illumos return (SET_ERROR(EEXIST)); #else @@ -846,10 +619,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, * First try removing the name from the directory; if that * fails, return the error. */ - error = zfs_dropname(dl, zp, dzp, tx, flag); + error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); return (error); } @@ -876,16 +647,14 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; - ASSERT(error == 0); - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); + ASSERT0(error); } else { - error = zfs_dropname(dl, zp, dzp, tx, flag); + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } - mutex_enter(&dzp->z_lock); dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), @@ -900,8 +669,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - mutex_exit(&dzp->z_lock); + ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -912,14 +680,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, } /* - * Indicate whether the directory is empty. Works with or without z_lock - * held, but can only be consider a hint in the latter case. Returns true - * if only "." and ".." remain and there's no work in progress. + * Indicate whether the directory is empty. 
*/ boolean_t zfs_dirempty(znode_t *dzp) { - return (dzp->z_size == 2 && dzp->z_dirlocks == 0); + return (dzp->z_size == 2); } int @@ -1013,23 +779,20 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; - zfs_dirlock_t *dl; vattr_t va; int error; top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xvpp = ZTOV(xzp); - zfs_dirent_unlock(dl); return (0); } if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); #ifdef illumos return (SET_ERROR(ENOENT)); #else @@ -1038,7 +801,6 @@ top: } if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - zfs_dirent_unlock(dl); return (SET_ERROR(EROFS)); } @@ -1058,7 +820,6 @@ top: zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); - zfs_dirent_unlock(dl); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c index 3a472aa..819eca2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c @@ -124,7 +124,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) { if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -158,7 +158,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; - ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -205,7 +205,6 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) uint64_t crtime[2], mtime[2], ctime[2]; zfs_acl_phys_t znode_acl; char scanstamp[AV_SCANSTAMP_SZ]; - boolean_t drop_lock = B_FALSE; /* * No upgrade if ACL isn't cached @@ -217,20 +216,16 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) return; /* - * If the z_lock is held and we aren't the owner - * the just return since we don't want to deadlock + * If the vnode lock is held and we aren't the owner + * then just return since we don't want to deadlock * trying to update the status of z_is_sa. This * file can then be upgraded at a later time. * * Otherwise, we know we are doing the * sa_update() that caused us to enter this function. 
*/ - if (mutex_owner(&zp->z_lock) != curthread) { - if (mutex_tryenter(&zp->z_lock) == 0) + if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) return; - else - drop_lock = B_TRUE; - } /* First do a bulk query of the attributes that aren't cached */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); @@ -311,8 +306,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) zp->z_is_sa = B_TRUE; done: - if (drop_lock) - mutex_exit(&zp->z_lock); + VOP_UNLOCK(ZTOV(zp), 0); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 8523bc4..aa711f0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -957,6 +957,18 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) else if (error != 0) return (error); + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + return (0); } @@ -997,7 +1009,11 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) @@ -2044,7 +2060,7 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) @@ -2145,7 +2161,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { - VN_HOLD(*vpp); + vref(*vpp); } ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); @@ -2168,7 +2184,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - VN_RELE(ZTOV(zp)); + vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 17179f6..e2fe974 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -66,7 +66,6 @@ #include <sys/zfs_ctldir.h> #include <sys/zfs_fuid.h> #include <sys/zfs_sa.h> -#include <sys/dnlc.h> #include <sys/zfs_rlock.h> #include <sys/extdirent.h> #include <sys/kidmap.h> @@ -147,7 +146,7 @@ * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * zfs_dirent_lookup(&dl, ...) 
// lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify @@ -1433,26 +1432,81 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, return (error); } -/* - * If vnode is for a device return a specfs vnode instead. - */ static int -specvp_check(vnode_t **vpp, cred_t *cr) +zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { - int error = 0; - - if (IS_DEVVP(*vpp)) { - struct vnode *svp; + int error; - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = SET_ERROR(ENOSYS); - *vpp = svp; - } + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); return (error); } +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + int ltype; + + ASSERT_VOP_LOCKED(dvp, __func__); +#ifdef DIAGNOSTIC + ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); +#endif + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (dvp->v_iflag & VI_DOOMED) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. + * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} /* * Lookup an entry in a directory, or an extended attribute directory. @@ -1465,8 +1519,6 @@ specvp_check(vnode_t **vpp, cred_t *cr) * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context - * direntflags - directory lookup flags - * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. 
* @@ -1481,46 +1533,17 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); + znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; - int *direntflags = NULL; - void *realpnp = NULL; - - /* fast path */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + /* fast path (should be redundant with vfs namecache) */ + if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } - - if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (!error) { - *vpp = dvp; - VN_HOLD(*vpp); - return (0); - } - return (error); - } else { - vnode_t *tvp = dnlc_lookup(dvp, nm); - - if (tvp) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (error) { - VN_RELE(tvp); - return (error); - } - if (tvp == DNLC_NO_VNODE) { - VN_RELE(tvp); - return (SET_ERROR(ENOENT)); - } else { - *vpp = tvp; - return (specvp_check(vpp, cr)); - } - } - } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); @@ -1558,10 +1581,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, /* * Do we have permission to get into attribute directory? */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { - VN_RELE(*vpp); + vrele(*vpp); *vpp = NULL; } @@ -1569,15 +1591,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, return (error); } - if (dvp->v_type != VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTDIR)); - } - /* * Check accessibility of directory. */ - if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); @@ -1589,9 +1605,90 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, return (SET_ERROR(EILSEQ)); } - error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) - error = specvp_check(vpp, cr); + + /* + * First handle the special cases. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0) { + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. + */ + if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", vpp, NULL, 0, NULL, kcred, + NULL, NULL, NULL); + ZFS_EXIT(zfsvfs); + if (error == 0) { + error = zfs_lookup_lock(dvp, *vpp, nm, + cnp->cn_lkflags); + } + goto out; + } + } + if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { + error = 0; + if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) + error = SET_ERROR(ENOTSUP); + else + *vpp = zfsctl_root(zdp); + ZFS_EXIT(zfsvfs); + if (error == 0) + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + goto out; + } + + /* + * The loop is retry the lookup if the parent-child relationship + * changes during the dot-dot locking complexities. + */ + for (;;) { + uint64_t parent; + + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) + *vpp = ZTOV(zp); + + ZFS_EXIT(zfsvfs); + if (error != 0) + break; + + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + if (error != 0) { + /* + * If we've got a locking error, then the vnode + * got reclaimed because of a force unmount. + * We never enter doomed vnodes into the name cache. 
+ */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + +out: + if (error != 0) + *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { @@ -1610,42 +1707,20 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, break; } } - if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { - int ltype = 0; - if (cnp->cn_flags & ISDOTDOT) { - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp, 0); - } - ZFS_EXIT(zfsvfs); - error = vn_lock(*vpp, cnp->cn_lkflags); - if (cnp->cn_flags & ISDOTDOT) - vn_lock(dvp, ltype | LK_RETRY); - if (error != 0) { - VN_RELE(*vpp); - *vpp = NULL; - return (error); - } - } else { - ZFS_EXIT(zfsvfs); - } + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); -#ifdef FREEBSD_NAMECACHE - /* - * Insert name into cache (as non-existent) if appropriate. - */ - if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(dvp, *vpp, cnp); - /* - * Insert name into cache if appropriate. - */ - if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } -#endif return (error); } @@ -1683,7 +1758,6 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ksid_t *ksid; @@ -1691,10 +1765,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; void *vsecp = NULL; int flag = 0; + uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1731,182 +1804,89 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, } } - getnewvnode_reserve(1); - -top: *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - VN_HOLD(dvp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible VN_HOLD(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - if (strcmp(name, "..") == 0) - error = SET_ERROR(EISDIR); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); } + ASSERT3P(zp, ==, NULL); - if (zp == NULL) { - uint64_t txtype; - - /* - * Create a new file object and update the directory - * to reference it. 
- */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; + } - if ((dzp->z_pflags & ZFS_XATTR) && - (vap->va_type != VREG)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EINVAL); - goto out; - } + /* + * We only support the creation of regular files in + * extended attribute directories. + */ - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; - tx = dmu_tx_create(os); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); + getnewvnode_reserve(1); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + tx = dmu_tx_create(os); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); - (void) zfs_link_create(dl, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; - - if (have_acl) - zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl == EXCL) { - error = SET_ERROR(EEXIST); - goto out; - } - /* - * Can't open a directory for writing. 
- */ - if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { - error = SET_ERROR(EISDIR); - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); - /* - * Truncate regular files if requested. - */ - if ((ZTOV(zp)->v_type == VREG) && - (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - zfs_dirent_unlock(dl); - dl = NULL; - error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == 0) { - vnevent_create(ZTOV(zp), ct); - } - } - } -out: getnewvnode_drop_reserve(); - if (dl) - zfs_dirent_unlock(dl); - if (error) { - if (zp) - VN_RELE(ZTOV(zp)); - } else { +out: + if (error == 0) { *vpp = ZTOV(zp); - error = specvp_check(vpp, cr); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) @@ -1932,57 +1912,30 @@ out: * vp - ctime (if nlink > 0) */ -uint64_t null_xattr = 0; - /*ARGSUSED*/ static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, - int flags) +zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { - znode_t *zp, *dzp = VTOZ(dvp); + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); znode_t *xzp; - vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; - zfs_dirlock_t *dl; dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; + zp = VTOZ(vp); - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: xattr_obj = 0; xzp = NULL; - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp)) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; @@ -1998,14 +1951,15 @@ top: vnevent_remove(vp, dvp, name, ct); - if (realnmp) - dnlc_remove(dvp, realnmp->pn_buf); - else - dnlc_remove(dvp, name); + obj = zp->z_id; - VI_LOCK(vp); - may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); - VI_UNLOCK(vp); + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } /* * We may delete the znode now, or we may put it in the unlinked set; @@ -2013,35 +1967,17 @@ top: * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. 
*/ - obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); - if (may_delete_now) { - toobig = - zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); + if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } - mutex_enter(&zp->z_lock); - if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - mutex_exit(&zp->z_lock); - /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); @@ -2050,20 +1986,8 @@ top: */ dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (xzp) - VN_RELE(ZTOV(xzp)); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - if (realnmp) - pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -2072,7 +1996,7 @@ top: /* * Remove the directory entry. */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); @@ -2080,76 +2004,18 @@ top: } if (unlinked) { - /* - * Hold z_lock so that we can make sure that the ACL obj - * hasn't changed. Could have been deleted due to - * zfs_sa_upgrade(). 
- */ - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); - delete_now = may_delete_now && !toobig && - vp->v_count == 1 && !vn_has_cached_data(vp) && - xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == - acl_obj; - VI_UNLOCK(vp); - } - - if (delete_now) { -#ifdef __FreeBSD__ - panic("zfs_remove: delete_now branch taken"); -#endif - if (xattr_obj_unlinked) { - ASSERT3U(xzp->z_links, ==, 2); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - xzp->z_links = 0; - error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &xzp->z_links, sizeof (xzp->z_links), tx); - ASSERT3U(error, ==, 0); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - - if (zp->z_is_sa) - error = sa_remove(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), tx); - else - error = sa_update(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), &null_xattr, - sizeof (uint64_t), tx); - ASSERT0(error); - } - VI_LOCK(vp); - vp->v_count--; - ASSERT0(vp->v_count); - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - zfs_znode_delete(zp, tx); - } else if (unlinked) { - mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); -#ifdef __FreeBSD__ vp->v_vflag |= VV_NOSYNC; -#endif } txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: - if (realnmp) - pn_free(realnmp); - - zfs_dirent_unlock(dl); - if (!delete_now) - VN_RELE(vp); if (xzp) - VN_RELE(ZTOV(xzp)); + vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2180,23 +2046,19 @@ out: */ /*ARGSUSED*/ static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, - caller_context_t *ct, int flags, vsecattr_t *vsecp) +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; - int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; - boolean_t waited = B_FALSE; ASSERT(vap->va_type == VDIR); @@ -2211,7 +2073,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || + ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); @@ -2229,8 +2091,6 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zf |= ZCILOOK; if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, @@ -2241,13 +2101,11 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { + NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } - getnewvnode_reserve(1); - /* * First make sure the new directory doesn't exist. * @@ -2255,29 +2113,23 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, * EACCES instead of EEXIST which can cause some applications * to fail. 
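The ordering this comment describes is visible in the next hunk: zfs_mkdir() probes for the name before running the ACL check, so a caller who lacks write permission on the directory still sees EEXIST (not EACCES) when the name is already taken. A schematic of the sequence, condensed from the calls below (error unwinding omitted):

	error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW);
	if (error)			/* EEXIST if the name exists */
		return (error);
	ASSERT3P(zp, ==, NULL);		/* ZNEW: success implies absence */
	error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr);
	if (error)			/* EACCES only for a truly new name */
		return (error);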
*/ -top: *vpp = NULL; - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL)) { + if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } + ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } @@ -2285,6 +2137,7 @@ top: /* * Add a new entry to the directory. */ + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); @@ -2299,15 +2152,8 @@ top: dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); @@ -2326,14 +2172,12 @@ top: /* * Now put new name in parent dir. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); @@ -2342,8 +2186,6 @@ top: getnewvnode_drop_reserve(); - zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2370,39 +2212,20 @@ top: */ /*ARGSUSED*/ static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); - znode_t *zp; - vnode_t *vp; + znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; @@ -2413,25 +2236,8 @@ top: goto out; } - if (vp == cwd) { - error = SET_ERROR(EINVAL); - goto out; - } - vnevent_rmdir(vp, dvp, name, ct); - /* - * Grab a lock on the directory to make sure that noone is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. 
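The two removed comments here describe rwlocks whose whole purpose was to serialize directory updates against concurrent lookups and renames. In the FreeBSD model this patch adopts, the VFS already holds the directory vnode exclusively locked across any modifying VOP, so the per-znode z_name_lock/z_parent_lock pair is redundant. The invariant the new code leans on could be spelled out as (hypothetical assertion, not present in the patch):

	/* The caller holds dvp exclusive for every modifying VOP. */
	ASSERT_VOP_ELOCKED(dvp, "zfs directory update");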
- */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); @@ -2439,48 +2245,26 @@ top: zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } -#ifdef FREEBSD_NAMECACHE cache_purge(dvp); -#endif - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -#ifdef FREEBSD_NAMECACHE cache_purge(vp); -#endif out: - zfs_dirent_unlock(dl); - - VN_RELE(vp); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -2705,10 +2489,10 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); goto skip_entry; } - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) @@ -2905,7 +2689,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * than to determine whether we were asked the question. */ - mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos @@ -3042,7 +2825,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); - mutex_exit(&zp->z_lock); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; @@ -3178,7 +2960,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, } } -top: attrzp = NULL; aclp = NULL; @@ -3267,7 +3048,6 @@ top: } } - mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { @@ -3341,7 +3121,6 @@ top: } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } @@ -3353,8 +3132,6 @@ top: } } - mutex_exit(&zp->z_lock); - if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, @@ -3429,7 +3206,7 @@ top: if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } @@ -3441,7 +3218,7 @@ top: if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } @@ -3463,7 +3240,6 @@ top: if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; - mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format @@ -3484,7 +3260,6 @@ top: dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && @@ -3517,10 
+3292,8 @@ top: * updated as a side-effect of calling this function. */ - if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); @@ -3528,7 +3301,6 @@ top: if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); @@ -3662,14 +3434,12 @@ top: if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); - mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { @@ -3679,7 +3449,7 @@ out: } if (attrzp) - VN_RELE(ZTOV(attrzp)); + vrele(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); @@ -3691,8 +3461,6 @@ out: if (err) { dmu_tx_abort(tx); - if (err == ERESTART) - goto top; } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); @@ -3706,101 +3474,236 @@ out2: return (err); } -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - /* - * Drop locks and release vnodes that were held by zfs_rename_lock(). + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) { - zfs_zlock_t *zl; + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK(tdvp, 0); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK(*tvpp, 0); - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - VN_RELE(ZTOV(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK(tdvp, 0); + goto relock; } -} + tdzp = VTOZ(tdvp); -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. 
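The acquire/release dance described in the new zfs_rename_relock() comment above is a standard deadlock-avoidance pattern: take what you can without blocking, and on contention drop everything, wait once for the contended lock, then retry. A minimal userland analogue with POSIX threads (illustrative only; vnode locks and the real function's four-lock set differ in detail):

	#include <pthread.h>

	static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	static void
	lock_both(void)
	{
		for (;;) {
			pthread_mutex_lock(&a);		/* blocking, like sdvp */
			if (pthread_mutex_trylock(&b) == 0)
				return;			/* both locks held */
			pthread_mutex_unlock(&a);	/* drop all held locks */
			pthread_mutex_lock(&b);		/* wait out the owner */
			pthread_mutex_unlock(&b);	/* ...but do not keep it */
			/* restart: no deadlock, no busy-waiting */
		}
	}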
- * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t oidp = zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. + * We cannot use ZFS_VERIFY_ZP() here because it could return directly, + * bypassing the cleanup code in the case of an error. */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. - */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp; if it disappeared, we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try to acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK(nvp, 0); + /* + * Concurrent rename race. + * XXX ? 
+ */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(*svpp, 0); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; } + vput(nvp); + goto relock; } + *tvpp = nvp; + } - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; + return (0); - if (oidp == szp->z_id) /* We're a descendant of szp */ - return (SET_ERROR(EINVAL)); +out: + return (error); +} - if (oidp == rootid) /* We've hit the top */ - return (0); +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), - &oidp, sizeof (oidp)); - rwlp = &zp->z_parent_lock; - rw = RW_READER; + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; - } while (zp->z_id != sdzp->z_id); + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; - return (0); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); } /* @@ -3822,187 +3725,93 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) */ /*ARGSUSED*/ static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr) { - znode_t *tdzp, *sdzp, *szp, *tzp; - zfsvfs_t *zfsvfs; - zilog_t *zilog; - vnode_t *realvp; - zfs_dirlock_t *sdl, *tdl; + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; + char *snm = scnp->cn_nameptr; + char *tnm = tcnp->cn_nameptr; int error = 0; - int zflg = 0; - boolean_t waited = B_FALSE; - tdzp = VTOZ(tdvp); - ZFS_VERIFY_ZP(tdzp); - zfsvfs = tdzp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - zilog = zfsvfs->z_log; - sdzp = VTOZ(sdvp); + /* Reject renames across filesystems. 
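zfs_rename_check() above replaces the removed zfs_rename_lock(): rather than read-locking every ".." ancestor, it walks parent object ids (the SA_ZPL_PARENT attribute) from the target directory toward the root and fails if it encounters the source, which is what forbids moving a directory underneath its own descendant. A self-contained analogue over a toy id space (parent_of() is an assumed stand-in for the SA lookup):

	#include <errno.h>
	#include <stdint.h>

	#define	ROOT_ID	1

	extern uint64_t parent_of(uint64_t id);	/* assumed helper */

	static int
	rename_check(uint64_t src, uint64_t srcdir, uint64_t tgtdir)
	{
		uint64_t id;

		for (id = tgtdir; id != ROOT_ID && id != srcdir;
		    id = parent_of(id)) {
			if (id == src)
				return (EINVAL);	/* target below source */
		}
		return (0);
	}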
*/ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } /* - * In case sdzp is not valid, let's be sure to exit from the right - * zfsvfs_t. + * Lock all four vnodes to ensure safety and semantics of renaming. */ - if (sdzp->z_sa_hdl == NULL) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EIO)); + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); } + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + /* - * We check z_zfsvfs rather than v_vfsp here, because snapshots and the - * ctldir appear to have the same v_vfsp. + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. */ - if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } + ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); + error = SET_ERROR(EILSEQ); + goto unlockout; } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; } - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. - */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. 
- */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; } /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; } - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; } - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - VN_RELE(ZTOV(tzp)); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - /* - * FreeBSD: In OpenSolaris they only check if rename source is - * ".." here, because "." is handled in their lookup. This is - * not the case for FreeBSD, so we check for "." explicitly. - */ - if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) - serr = SET_ERROR(EINVAL); - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - VN_RELE(ZTOV(szp)); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(tnm, "..") == 0) - terr = SET_ERROR(EINVAL); - ZFS_EXIT(zfsvfs); - return (terr); + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; } /* @@ -4011,17 +3820,26 @@ top: * Note that if target and source are the same, this can be * done in a single check. */ - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto out; + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } - if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ - if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) - goto out; + if (error = zfs_rename_check(szp, sdzp, tdzp)) + goto unlockout; } /* @@ -4031,31 +3849,26 @@ top: /* * Source and target must be the same type. 
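One detail worth isolating from the hunk above: the old code could only reject "." and ".." by comparing the raw name strings on an error path, while the new code tests the parsed componentname metadata up front. Condensed from the two fragments in the diff:

	/* Old: string compare after the dirent lookup failed. */
	if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
		serr = SET_ERROR(EINVAL);

	/* New: the VFS lookup already classified the component. */
	if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
	    ((scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) != 0) {
		error = EINVAL;
		goto unlockout;
	}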
*/ - if (ZTOV(szp)->v_type == VDIR) { - if (ZTOV(tzp)->v_type != VDIR) { + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); - goto out; + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); } } else { - if (ZTOV(tzp)->v_type == VDIR) { + if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); - goto out; + goto unlockout; } } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same @@ -4081,35 +3894,18 @@ top: zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); + goto unlockout; } + if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; @@ -4117,17 +3913,16 @@ top: (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ - vn_renamepath(tdvp, ZTOV(szp), tnm, - strlen(tnm)); + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created @@ -4141,42 +3936,33 @@ top: * succeed; fortunately, it is very unlikely to * fail, since we just created it. 
*/ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } -#ifdef FREEBSD_NAMECACHE if (error == 0) { - cache_purge(sdvp); - cache_purge(tdvp); - cache_purge(ZTOV(szp)); - if (tzp) - cache_purge(ZTOV(tzp)); + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); } -#endif } dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(*svpp, 0); + VOP_UNLOCK(sdvp, 0); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) +out: /* original two vnodes are locked */ + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); - + if (*tvpp != NULL) + VOP_UNLOCK(*tvpp, 0); + if (tdvp != *tvpp) + VOP_UNLOCK(tdvp, 0); return (error); } @@ -4201,17 +3987,14 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); - zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; - int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; - boolean_t waited = B_FALSE; int flags = 0; ASSERT(vap->va_type == VLNK); @@ -4225,8 +4008,6 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); @@ -4239,35 +4020,29 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, return (error); } - getnewvnode_reserve(1); - -top: /* * Attempt to lock directory; fail if entry already exists. */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } + + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); @@ -4281,15 +4056,8 @@ top: } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); @@ -4306,13 +4074,11 @@ top: if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); - mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), @@ -4320,10 +4086,8 @@ top: /* * Insert the new object into the directory. 
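Stepping back to the rewritten zfs_rename() above: the unlockout/out pair encodes two distinct unwind states, and the success path deliberately falls through both labels. An annotated schematic of the invariants (label comments expanded editorially; the statements mirror the diff):

	unlockout:	/* all four vnodes locked and ZFS_ENTER() still held */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(*svpp, 0);
		VOP_UNLOCK(sdvp, 0);

	out:		/* only the original target pair (tdvp/*tvpp) is locked */
		if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			zil_commit(zilog, 0);
		if (*tvpp != NULL)
			VOP_UNLOCK(*tvpp, 0);
		if (tdvp != *tvpp)
			VOP_UNLOCK(tdvp, 0);
		return (error);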
*/ - (void) zfs_link_create(dl, zp, tx, ZNEW); + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); @@ -4333,8 +4097,6 @@ top: getnewvnode_drop_reserve(); - zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); @@ -4369,13 +4131,11 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); - mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -4407,14 +4167,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; - vnode_t *realvp; int error; - int zf = ZNEW; uint64_t parent; uid_t owner; - boolean_t waited = B_FALSE; ASSERT(tdvp->v_type == VDIR); @@ -4422,9 +4178,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (VOP_REALVP(svp, &realvp, ct) == 0) - svp = realvp; - /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. @@ -4442,15 +4195,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, return (SET_ERROR(EPERM)); } - /* - * We check z_zfsvfs rather than v_vfsp here, because snapshots and the - * ctldir appear to have the same v_vfsp. - */ - if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), @@ -4468,8 +4212,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes @@ -4494,11 +4236,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, return (error); } -top: /* * Attempt to lock directory; fail if entry already exists. */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); @@ -4509,33 +4250,22 @@ top: dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } - error = zfs_link_create(dl, szp, tx, 0); + error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); - zfs_dirent_unlock(dl); - if (error == 0) { vnevent_link(svp, ct); } @@ -4547,235 +4277,6 @@ top: return (error); } -#ifdef illumos -/* - * zfs_null_putapage() is used when the file system has been force - * unmounted. It just drops the pages. - */ -/* ARGSUSED */ -static int -zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); - return (0); -} - -/* - * Push a page out to disk, klustering if possible. 
- * - * IN: vp - file to push page to. - * pp - page to push. - * flags - additional flags. - * cr - credentials of caller. - * - * OUT: offp - start of range pushed. - * lenp - len of range pushed. - * - * RETURN: 0 on success, error code on failure. - * - * NOTE: callers must have locked the page to be pushed. On - * exit, the page (and all other pages in the kluster) must be - * unlocked. - */ -/* ARGSUSED */ -static int -zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_tx_t *tx; - u_offset_t off, koff; - size_t len, klen; - int err; - - off = pp->p_offset; - len = PAGESIZE; - /* - * If our blocksize is bigger than the page size, try to kluster - * multiple pages so that we write a full block (thus avoiding - * a read-modify-write). - */ - if (off < zp->z_size && zp->z_blksz > PAGESIZE) { - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; - ASSERT(koff <= zp->z_size); - if (koff + klen > zp->z_size) - klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); - pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); - } - ASSERT3U(btop(len), ==, btopr(len)); - - /* - * Can't push pages past end-of-file. - */ - if (off >= zp->z_size) { - /* ignore all pages */ - err = 0; - goto out; - } else if (off + len > zp->z_size) { - int npages = btopr(zp->z_size - off); - page_t *trunc; - - page_list_break(&pp, &trunc, npages); - /* ignore pages past end of file */ - if (trunc) - pvn_write_done(trunc, flags); - len = zp->z_size - off; - } - - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - err = SET_ERROR(EDQUOT); - goto out; - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, off, len); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - goto out; - } - - if (zp->z_blksz <= PAGESIZE) { - caddr_t va = zfs_map_page(pp, S_READ); - ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); - zfs_unmap_page(pp, va); - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); - } - - if (err == 0) { - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - } - dmu_tx_commit(tx); - -out: - pvn_write_done(pp, (err ? B_ERROR : 0) | flags); - if (offp) - *offp = off; - if (lenp) - *lenp = len; - - return (err); -} - -/* - * Copy the portion of the file indicated from pages into the file. - * The pages are stored in a page list attached to the files vnode. - * - * IN: vp - vnode of file to push page data to. - * off - position in file to put data. - * len - amount of data to write. - * flags - flags to control the operation. - * cr - credentials of caller. - * ct - caller context. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp; - size_t io_len; - u_offset_t io_off; - uint_t blksz; - rl_t *rl; - int error = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * Align this request to the file block size in case we kluster. - * XXX - this can result in pretty aggresive locking, which can - * impact simultanious read/write access. One option might be - * to break up long requests (len == 0) into block-by-block - * operations to get narrower locking. - */ - blksz = zp->z_blksz; - if (ISP2(blksz)) - io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); - else - io_off = 0; - if (len > 0 && ISP2(blksz)) - io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); - else - io_len = 0; - - if (io_len == 0) { - /* - * Search the entire vp list for pages >= io_off. - */ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); - error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); - goto out; - } - rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - - if (off > zp->z_size) { - /* past end of file */ - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (0); - } - - len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); - - for (off = io_off; io_off < off + len; io_off += io_len) { - if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { - pp = page_lookup(vp, io_off, - (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); - } else { - pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); - } - - if (pp != NULL && pvn_getdirty(pp, flags)) { - int err; - - /* - * Found a dirty page to push - */ - err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); - if (err) - error = err; - } else { - io_len = PAGESIZE; - } - } -out: - zfs_range_unlock(rl); - if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* illumos */ /*ARGSUSED*/ void @@ -4796,17 +4297,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) return; } - mutex_enter(&zp->z_lock); if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ - mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } - mutex_exit(&zp->z_lock); if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); @@ -4817,444 +4315,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) if (error) { dmu_tx_abort(tx); } else { - mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } -#ifdef illumos -/* - * Bounds-check the seek operation. - * - * IN: vp - vnode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * ct - caller context - * - * RETURN: 0 on success, EINVAL if new offset invalid. - */ -/* ARGSUSED */ -static int -zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, - caller_context_t *ct) -{ - if (vp->v_type == VDIR) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -/* - * Pre-filter the generic locking function to trap attempts to place - * a mandatory lock on a memory mapped file. 
- */ -static int -zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, - flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * We are following the UFS semantics with respect to mapcnt - * here: If we see that the file is mapped already, then we will - * return an error, but we don't worry about races between this - * function and zfs_map(). - */ - if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EAGAIN)); - } - ZFS_EXIT(zfsvfs); - return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); -} - -/* - * If we can't find a page in the cache, we will create a new page - * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering) to fill up the supplied page - * list. Note that the pages to be filled are held with an exclusive - * lock to prevent access by other threads while they are being filled. - */ -static int -zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, - caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) -{ - znode_t *zp = VTOZ(vp); - page_t *pp, *cur_pp; - objset_t *os = zp->z_zfsvfs->z_os; - u_offset_t io_off, total; - size_t io_len; - int err; - - if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { - /* - * We only have a single page, don't bother klustering - */ - io_off = off; - io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, - PG_EXCL | PG_WAIT, seg, addr); - } else { - /* - * Try to find enough pages to fill the page list - */ - pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, off, plsz, 0); - } - if (pp == NULL) { - /* - * The page already exists, nothing to do here. - */ - *pl = NULL; - return (0); - } - - /* - * Fill the pages in the kluster. - */ - cur_pp = pp; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - ASSERT3U(io_off, ==, cur_pp->p_offset); - va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - zfs_unmap_page(cur_pp, va); - if (err) { - /* On error, toss the entire kluster */ - pvn_read_done(pp, B_ERROR); - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - cur_pp = cur_pp->p_next; - } - - /* - * Fill in the page list array from the kluster starting - * from the desired offset `off'. - * NOTE: the page list will always be null terminated. - */ - pvn_plist_init(pp, pl, plsz, off, io_len, rw); - ASSERT(pl == NULL || (*pl)->p_offset == off); - - return (0); -} - -/* - * Return pointers to the pages for the file region [off, off + len] - * in the pl array. If plsz is greater than len, this function may - * also return page pointers from after the specified region - * (i.e. the region [off, off + plsz]). These additional pages are - * only returned if they are already in the cache, or were created as - * part of a klustered read. - * - * IN: vp - vnode of file to get data from. - * off - position in file to get data from. - * len - amount of data to retrieve. - * plsz - length of provided page list. - * seg - segment to obtain pages for. - * addr - virtual address of fault. - * rw - mode of created pages. - * cr - credentials of caller. - * ct - caller context. - * - * OUT: protp - protection mode of created pages. - * pl - list of pages created. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t **pl0 = pl; - int err = 0; - - /* we do our own caching, faultahead is unnecessary */ - if (pl == NULL) - return (0); - else if (len > plsz) - len = plsz; - else - len = P2ROUNDUP(len, PAGESIZE); - ASSERT(plsz >= len); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (protp) - *protp = PROT_ALL; - - /* - * Loop through the requested range [off, off + len) looking - * for pages. If we don't find a page, we will need to create - * a new page and fill it with data from the file. - */ - while (len > 0) { - if (*pl = page_lookup(vp, off, SE_SHARED)) - *(pl+1) = NULL; - else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) - goto out; - while (*pl) { - ASSERT3U((*pl)->p_offset, ==, off); - off += PAGESIZE; - addr += PAGESIZE; - if (len > 0) { - ASSERT3U(len, >=, PAGESIZE); - len -= PAGESIZE; - } - ASSERT3U(plsz, >=, PAGESIZE); - plsz -= PAGESIZE; - pl++; - } - } - - /* - * Fill out the page array with any pages already in the cache. - */ - while (plsz > 0 && - (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { - off += PAGESIZE; - plsz -= PAGESIZE; - } -out: - if (err) { - /* - * Release any pages we have previously locked. - */ - while (pl > pl0) - page_unlock(*--pl); - } else { - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - } - - *pl = NULL; - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Request a memory map for a section of a file. This code interacts - * with common code and the VM system as follows: - * - * - common code calls mmap(), which ends up in smmap_common() - * - this calls VOP_MAP(), which takes you into (say) zfs - * - zfs_map() calls as_map(), passing segvn_create() as the callback - * - segvn_create() creates the new segment and calls VOP_ADDMAP() - * - zfs_addmap() updates z_mapcnt - */ -/*ARGSUSED*/ -static int -zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - segvn_crargs_t vn_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((prot & PROT_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((prot & (PROT_READ | PROT_EXEC)) && - (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - if (vp->v_flag & VNOMAP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOSYS)); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENXIO)); - } - - if (vp->v_type != VREG) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENODEV)); - } - - /* - * If file is locked, disallow mapping. 
- */ - if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EAGAIN)); - } - - as_rangelock(as); - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); - } - - vn_a.vp = vp; - vn_a.offset = (u_offset_t)off; - vn_a.type = flags & MAP_TYPE; - vn_a.prot = prot; - vn_a.maxprot = maxprot; - vn_a.cred = cr; - vn_a.amp = NULL; - vn_a.flags = flags & ~MAP_TYPE; - vn_a.szc = 0; - vn_a.lgrp_mem_policy_flags = 0; - - error = as_map(as, *addrp, len, segvn_create, &vn_a); - - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -static int -zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); - return (0); -} - -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. - */ -/* ARGSUSED */ -static int -zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); - atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - - return (0); -} - -/* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. - * - * IN: vp - vnode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. - * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller [UNUSED]. - * ct - caller context. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - ctime|mtime updated - */ -/* ARGSUSED */ -static int -zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t off, len; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our - * callers might not be able to detect properly that we are read-only, - * so check it explicitly here. 
- */ - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - if (error = convoff(vp, bfp, 0, offset)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - off = bfp->l_start; - len = bfp->l_len; /* 0 means from off to end of file */ - - error = zfs_freesp(zp, off, len, flag, TRUE); - - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* illumos */ CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); @@ -5331,7 +4400,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; - zfs_dirlock_t *dl; int error; switch (cmd) { @@ -5349,13 +4417,12 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; - error = zfs_dirent_lock(&dl, zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); + error = zfs_dirent_lookup(zp, "", &xzp, + ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { - zfs_dirent_unlock(dl); if (!zfs_dirempty(xzp)) *valp = 1; - VN_RELE(ZTOV(xzp)); + vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the @@ -5448,339 +4515,6 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, return (error); } -#ifdef illumos -/* - * The smallest read we may consider to loan out an arcbuf. - * This must be a power of 2. - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -/* - * If set to less than the file block size, allow loaning out of an - * arcbuf for a partial block read. This must be a power of 2. - */ -int zcr_blksz_max = (1 << 17); /* 128K */ - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, - int, postamble, int, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. 
- */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. - */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz || vn_has_cached_data(vp)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). - */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} - -/* - * Predeclare these here so that the compiler assumes that - * this is an "old style" function declaration that does - * not include arguments => we won't get type mismatch errors - * in the initializations that follow. 
- */ -static int zfs_inval(); -static int zfs_isdir(); - -static int -zfs_inval() -{ - return (SET_ERROR(EINVAL)); -} - -static int -zfs_isdir() -{ - return (SET_ERROR(EISDIR)); -} -/* - * Directory vnode operations template - */ -vnodeops_t *zfs_dvnodeops; -const fs_operation_def_t zfs_dvnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_READ, { .error = zfs_isdir }, - VOPNAME_WRITE, { .error = zfs_isdir }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_CREATE, { .vop_create = zfs_create }, - VOPNAME_REMOVE, { .vop_remove = zfs_remove }, - VOPNAME_LINK, { .vop_link = zfs_link }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, - VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, - VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, - VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Regular file vnode operations template - */ -vnodeops_t *zfs_fvnodeops; -const fs_operation_def_t zfs_fvnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_READ, { .vop_read = zfs_read }, - VOPNAME_WRITE, { .vop_write = zfs_write }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, - VOPNAME_SPACE, { .vop_space = zfs_space }, - VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, - VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, - VOPNAME_MAP, { .vop_map = zfs_map }, - VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, - VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, - VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, - NULL, NULL -}; - -/* - * Symbolic link vnode operations template - */ -vnodeops_t *zfs_symvnodeops; -const fs_operation_def_t zfs_symvnodeops_template[] = { - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - 
VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * special share hidden files vnode operations template - */ -vnodeops_t *zfs_sharevnodeops; -const fs_operation_def_t zfs_sharevnodeops_template[] = { - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Extended attribute directory vnode operations template - * - * This template is identical to the directory vnodes - * operation template except for restricted operations: - * VOP_MKDIR() - * VOP_SYMLINK() - * - * Note that there are other restrictions embedded in: - * zfs_create() - restrict type to VREG - * zfs_link() - no links into/out of attribute space - * zfs_rename() - no moves into/out of attribute space - */ -vnodeops_t *zfs_xdvnodeops; -const fs_operation_def_t zfs_xdvnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = zfs_open }, - VOPNAME_CLOSE, { .vop_close = zfs_close }, - VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, - VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, - VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, - VOPNAME_ACCESS, { .vop_access = zfs_access }, - VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, - VOPNAME_CREATE, { .vop_create = zfs_create }, - VOPNAME_REMOVE, { .vop_remove = zfs_remove }, - VOPNAME_LINK, { .vop_link = zfs_link }, - VOPNAME_RENAME, { .vop_rename = zfs_rename }, - VOPNAME_MKDIR, { .error = zfs_inval }, - VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, - VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, - VOPNAME_SYMLINK, { .error = zfs_inval }, - VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_FID, { .vop_fid = zfs_fid }, - VOPNAME_SEEK, { .vop_seek = zfs_seek }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, - VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, - VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, - NULL, NULL -}; - -/* - * Error vnode operations template - */ -vnodeops_t *zfs_evnodeops; -const fs_operation_def_t zfs_evnodeops_template[] = { - VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, - VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, - NULL, NULL -}; -#endif /* illumos */ - static int ioflags(int ioflags) { @@ -5789,7 +4523,7 @@ ioflags(int ioflags) if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) - flags |= FNONBLOCK; + flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); @@ -6257,6 +4991,23 @@ zfs_freebsd_lookup(ap) } static int +zfs_cache_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + zfsvfs_t *zfsvfs; + + zfsvfs = ap->a_dvp->v_mount->mnt_data; + if (zfsvfs->z_use_namecache) + return (vfs_cache_lookup(ap)); + else + return (zfs_freebsd_lookup(ap)); +} + +static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; @@ -6265,6 +5016,7 @@ zfs_freebsd_create(ap) struct vattr *a_vap; } */ *ap; { + zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; @@ -6273,13 +5025,13 @@ zfs_freebsd_create(ap) vattr_init_mask(vap); 
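	/*
	 * Note: with the FREEBSD_NAMECACHE ifdefs gone, namecache use is a
	 * per-mount runtime decision; the z_use_namecache flag tested in
	 * zfs_cache_lookup() above also gates the cache_enter() call below.
	 */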
mode = vap->va_mode & ALLPERMS; + zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); -#ifdef FREEBSD_NAMECACHE - if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) + if (zfsvfs->z_use_namecache && + error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); -#endif return (error); } @@ -6294,8 +5046,8 @@ zfs_freebsd_remove(ap) ASSERT(ap->a_cnp->cn_flags & SAVENAME); - return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred, NULL, 0)); + return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, + ap->a_cnp->cn_cred)); } static int @@ -6314,7 +5066,7 @@ zfs_freebsd_mkdir(ap) vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred, NULL, 0, NULL)); + ap->a_cnp->cn_cred)); } static int @@ -6329,7 +5081,7 @@ zfs_freebsd_rmdir(ap) ASSERT(cnp->cn_flags & SAVENAME); - return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); + return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int @@ -6563,23 +5315,14 @@ zfs_freebsd_rename(ap) ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); - /* - * Check for cross-device rename. - */ - if ((fdvp->v_mount != tdvp->v_mount) || - (tvp && (fdvp->v_mount != tvp->v_mount))) - error = EXDEV; - else - error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, - ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); - if (tdvp == tvp) - VN_RELE(tdvp); - else - VN_URELE(tdvp); - if (tvp) - VN_URELE(tvp); - VN_RELE(fdvp); - VN_RELE(fvp); + error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred); + + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp != NULL) + vrele(tvp); return (error); } @@ -7250,6 +5993,39 @@ zfs_vptocnp(struct vop_vptocnp_args *ap) return (error); } +#ifdef DIAGNOSTIC +static int +zfs_lock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + char *file; + int line; + } */ *ap; +{ + zfsvfs_t *zfsvfs; + znode_t *zp; + vnode_t *vp; + int flags; + int err; + + vp = ap->a_vp; + flags = ap->a_flags; + if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 && + (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) { + zfsvfs = zp->z_zfsvfs; + VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); + } + err = vop_stdlock(ap); + if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 && + (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) { + zfsvfs = zp->z_zfsvfs; + VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); + } + return (err); +} +#endif + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; @@ -7259,12 +6035,8 @@ struct vop_vector zfs_vnodeops = { .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, -#ifdef FREEBSD_NAMECACHE - .vop_lookup = vfs_cache_lookup, + .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, -#else - .vop_lookup = zfs_freebsd_lookup, -#endif .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, @@ -7296,6 +6068,9 @@ struct vop_vector zfs_vnodeops = { .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#endif }; struct vop_vector zfs_fifoops = { diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 3853838..c947e54 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -124,16 +124,12 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) list_link_init(&zp->z_link_node); - mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); - zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_vnode = NULL; zp->z_moved = 0; @@ -150,14 +146,10 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT(ZTOV(zp) == NULL); vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); - mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_parent_lock); - rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); avl_destroy(&zp->z_range_avl); mutex_destroy(&zp->z_range_lock); - ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); } @@ -559,8 +551,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); - mutex_enter(&zp->z_lock); - ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { @@ -580,7 +570,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; - mutex_exit(&zp->z_lock); vn_exists(ZTOV(zp)); } @@ -637,7 +626,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_vnode = vp; vp->v_data = zp; - ASSERT(zp->z_dirlocks == NULL); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_moved = 0; @@ -739,7 +727,14 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, /* * Acquire vnode lock before making it available to the world. */ +#ifdef DIAGNOSTIC + vop_lock1_t *orig_lock = vp->v_op->vop_lock1; + vp->v_op->vop_lock1 = vop_stdlock; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vp->v_op->vop_lock1 = orig_lock; +#else vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#endif VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); @@ -1161,54 +1156,55 @@ again: if (hdl != NULL) { zp = sa_get_userdata(hdl); - /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ - ASSERT3P(zp, !=, NULL); - - mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); - if (zp->z_unlinked) { - err = SET_ERROR(ENOENT); - } else { - vp = ZTOV(zp); - *zpp = zp; - err = 0; - } + *zpp = zp; + vp = ZTOV(zp); /* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */ - if (err == 0) - VN_HOLD(vp); + VN_HOLD(vp); - mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - if (err == 0) { - locked = VOP_ISLOCKED(vp); - VI_LOCK(vp); - if ((vp->v_iflag & VI_DOOMED) != 0 && - locked != LK_EXCLUSIVE) { - /* - * The vnode is doomed and this thread doesn't - * hold the exclusive lock on it, so the vnode - * must be being reclaimed by another thread. - * Otherwise the doomed vnode is being reclaimed - * by this thread and zfs_zget is called from - * ZIL internals. 
- */ - VI_UNLOCK(vp); - VN_RELE(vp); - goto again; - } + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0 && + locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ VI_UNLOCK(vp); + + /* + * XXX vrele() locks the vnode when the last reference + * is dropped. Although in this case the vnode is + * doomed / dead and so no inactivation is required, + * the vnode lock is still acquired. That could result + * in a LOR with z_teardown_lock if another thread holds + * the vnode's lock and tries to take z_teardown_lock. + * But that is only possible if the other thread performs + * a ZFS vnode operation on the vnode. That either + * should not happen if the vnode is dead or the thread + * should also have a reference to the vnode and thus + * our reference is not last. + */ + VN_RELE(vp); + goto again; } + VI_UNLOCK(vp); getnewvnode_drop_reserve(); - return (err); + return (0); } /* @@ -1391,20 +1387,16 @@ zfs_zinactive(znode_t *zp) */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); - mutex_enter(&zp->z_lock); - /* * If this was the last reference to a file with no links, * remove the file from the file system. */ if (zp->z_unlinked) { - mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } - mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index e6b1e90..0da005a 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -290,7 +290,10 @@ dev/lindev/full.c optional lindev dev/lindev/lindev.c optional lindev dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb -dev/ntb/ntb_hw/ntb_hw.c optional if_ntb ntb_hw +dev/ntb/ntb_transport.c optional if_ntb +dev/ntb/ntb.c optional if_ntb | ntb_hw +dev/ntb/ntb_if.m optional if_ntb | ntb_hw +dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nve/if_nve.c optional nve pci dev/nvme/nvme.c optional nvme diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index fe93e68..e46fe53 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -286,7 +286,10 @@ dev/mse/mse.c optional mse dev/mse/mse_isa.c optional mse isa dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb -dev/ntb/ntb_hw/ntb_hw.c optional if_ntb | ntb_hw +dev/ntb/ntb_transport.c optional if_ntb +dev/ntb/ntb.c optional if_ntb | ntb_hw +dev/ntb/ntb_if.m optional if_ntb | ntb_hw +dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nve/if_nve.c optional nve pci dev/nvme/nvme.c optional nvme diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c index 9db1c44..1295de7 100644 --- a/sys/dev/ahci/ahci.c +++ b/sys/dev/ahci/ahci.c @@ -373,7 +373,8 @@ ahci_setup_interrupt(device_t dev) else if (ctlr->numirqs == 1 || i >= ctlr->channels || (ctlr->ccc && i == ctlr->cccv)) ctlr->irqs[i].mode = AHCI_IRQ_MODE_ALL; - else if (i == ctlr->numirqs - 1) + else if (ctlr->channels > ctlr->numirqs && + i == ctlr->numirqs - 1) ctlr->irqs[i].mode = AHCI_IRQ_MODE_AFTER; else ctlr->irqs[i].mode = AHCI_IRQ_MODE_ONE; @@ -422,6 +423,7 @@ ahci_intr(void *data) } else { /* AHCI_IRQ_MODE_AFTER */ unit = irq->r_irq_rid - 1; is = ATA_INL(ctlr->r_mem, AHCI_IS); + is &= (0xffffffff << unit); } /* CCC interrupt is edge triggered.
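 * (On the AFTER-mode hunk above: assuming numirqs = 4, the last vector
 * has unit = 3, and is &= (0xffffffff << 3) masks off status bits 0..2,
 * which belong to the dedicated per-channel vectors, so this handler
 * services only the remaining channels.)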
*/ if (ctlr->ccc) diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c index 22f28e2..bb14ed6 100644 --- a/sys/dev/ahci/ahci_pci.c +++ b/sys/dev/ahci/ahci_pci.c @@ -187,7 +187,7 @@ static const struct { {0xa10f8086, 0x00, "Intel Sunrise Point (RAID)", 0}, {0x23238086, 0x00, "Intel DH89xxCC", 0}, {0x2360197b, 0x00, "JMicron JMB360", 0}, - {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE}, + {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE | AHCI_Q_1CH}, {0x2362197b, 0x00, "JMicron JMB362", 0}, {0x2363197b, 0x00, "JMicron JMB363", AHCI_Q_NOFORCE}, {0x2365197b, 0x00, "JMicron JMB365", AHCI_Q_NOFORCE}, diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c index 28379cc..52e2609 100644 --- a/sys/dev/e1000/e1000_api.c +++ b/sys/dev/e1000/e1000_api.c @@ -304,6 +304,10 @@ s32 e1000_set_mac_type(struct e1000_hw *hw) case E1000_DEV_ID_PCH_SPT_I219_LM2: case E1000_DEV_ID_PCH_SPT_I219_V2: case E1000_DEV_ID_PCH_LBG_I219_LM3: + case E1000_DEV_ID_PCH_SPT_I219_LM4: + case E1000_DEV_ID_PCH_SPT_I219_V4: + case E1000_DEV_ID_PCH_SPT_I219_LM5: + case E1000_DEV_ID_PCH_SPT_I219_V5: mac->type = e1000_pch_spt; break; case E1000_DEV_ID_82575EB_COPPER: diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h index 1792e14..e1464a7 100644 --- a/sys/dev/e1000/e1000_hw.h +++ b/sys/dev/e1000/e1000_hw.h @@ -142,6 +142,10 @@ struct e1000_hw; #define E1000_DEV_ID_PCH_SPT_I219_LM2 0x15B7 /* Sunrise Point-H PCH */ #define E1000_DEV_ID_PCH_SPT_I219_V2 0x15B8 /* Sunrise Point-H PCH */ #define E1000_DEV_ID_PCH_LBG_I219_LM3 0x15B9 /* LEWISBURG PCH */ +#define E1000_DEV_ID_PCH_SPT_I219_LM4 0x15D7 +#define E1000_DEV_ID_PCH_SPT_I219_V4 0x15D8 +#define E1000_DEV_ID_PCH_SPT_I219_LM5 0x15E3 +#define E1000_DEV_ID_PCH_SPT_I219_V5 0x15D6 #define E1000_DEV_ID_82576 0x10C9 #define E1000_DEV_ID_82576_FIBER 0x10E6 #define E1000_DEV_ID_82576_SERDES 0x10E7 @@ -957,9 +961,13 @@ struct e1000_dev_spec_ich8lan { E1000_MUTEX nvm_mutex; E1000_MUTEX swflag_mutex; bool nvm_k1_enabled; + bool disable_k1_off; bool eee_disable; u16 eee_lp_ability; enum e1000_ulp_state ulp_state; + bool ulp_capability_disabled; + bool during_suspend_flow; + bool during_dpg_exit; }; struct e1000_dev_spec_82575 { diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c index 9b9a090..4c93662 100644 --- a/sys/dev/e1000/e1000_ich8lan.c +++ b/sys/dev/e1000/e1000_ich8lan.c @@ -288,7 +288,7 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw) mac_reg &= ~E1000_CTRL_LANPHYPC_VALUE; E1000_WRITE_REG(hw, E1000_CTRL, mac_reg); E1000_WRITE_FLUSH(hw); - usec_delay(10); + msec_delay(1); mac_reg &= ~E1000_CTRL_LANPHYPC_OVERRIDE; E1000_WRITE_REG(hw, E1000_CTRL, mac_reg); E1000_WRITE_FLUSH(hw); @@ -1625,7 +1625,17 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) hw->phy.ops.write_reg_locked(hw, I217_PLL_CLOCK_GATE_REG, phy_reg); - } + + if (speed == SPEED_1000) { + hw->phy.ops.read_reg_locked(hw, HV_PM_CTRL, + &phy_reg); + + phy_reg |= HV_PM_CTRL_K1_CLK_REQ; + + hw->phy.ops.write_reg_locked(hw, HV_PM_CTRL, + phy_reg); + } + } hw->phy.ops.release(hw); if (ret_val) @@ -1718,7 +1728,8 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) u32 pcieanacfg = E1000_READ_REG(hw, E1000_PCIEANACFG); u32 fextnvm6 = E1000_READ_REG(hw, E1000_FEXTNVM6); - if (pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE) + if ((pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE) && + (hw->dev_spec.ich8lan.disable_k1_off == FALSE)) fextnvm6 |= E1000_FEXTNVM6_K1_OFF_ENABLE; else fextnvm6 &= ~E1000_FEXTNVM6_K1_OFF_ENABLE; diff 
--git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h index edc1dd1..6d81291 100644 --- a/sys/dev/e1000/e1000_ich8lan.h +++ b/sys/dev/e1000/e1000_ich8lan.h @@ -239,7 +239,7 @@ /* PHY Power Management Control */ #define HV_PM_CTRL PHY_REG(770, 17) -#define HV_PM_CTRL_PLL_STOP_IN_K1_GIGA 0x100 +#define HV_PM_CTRL_K1_CLK_REQ 0x200 #define HV_PM_CTRL_K1_ENABLE 0x4000 #define I217_PLL_CLOCK_GATE_REG PHY_REG(772, 28) diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c index b2bec3e..9684b43 100644 --- a/sys/dev/e1000/e1000_phy.c +++ b/sys/dev/e1000/e1000_phy.c @@ -4148,10 +4148,10 @@ s32 e1000_read_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 *data) /* Disable access to mPHY if it was originally disabled */ if (locked) ready = e1000_is_mphy_ready(hw); - if (!ready) - return -E1000_ERR_PHY; - E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, - E1000_MPHY_DIS_ACCESS); + if (!ready) + return -E1000_ERR_PHY; + E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, + E1000_MPHY_DIS_ACCESS); return E1000_SUCCESS; } @@ -4213,10 +4213,10 @@ s32 e1000_write_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 data, /* Disable access to mPHY if it was originally disabled */ if (locked) ready = e1000_is_mphy_ready(hw); - if (!ready) - return -E1000_ERR_PHY; - E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, - E1000_MPHY_DIS_ACCESS); + if (!ready) + return -E1000_ERR_PHY; + E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL, + E1000_MPHY_DIS_ACCESS); return E1000_SUCCESS; } diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 46f3e48..6b6b791 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -192,6 +192,12 @@ static em_vendor_info_t em_vendor_info_array[] = { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V2, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_PCH_LBG_I219_LM3, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM4, + PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V4, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM5, + PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V5, PCI_ANY_ID, PCI_ANY_ID, 0}, /* required last entry */ { 0, 0, 0, 0, 0} }; diff --git a/sys/dev/filemon/filemon.c b/sys/dev/filemon/filemon.c index 919af9d..26e1bc3 100644 --- a/sys/dev/filemon/filemon.c +++ b/sys/dev/filemon/filemon.c @@ -137,6 +137,8 @@ filemon_proc_get(struct proc *p) { struct filemon *filemon; + if (p->p_filemon == NULL) + return (NULL); PROC_LOCK(p); filemon = filemon_acquire(p->p_filemon); PROC_UNLOCK(p); diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index 936e4e1..18626cb 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -810,6 +810,7 @@ hv_storvsc_on_iocompletion(struct storvsc_softc *sc, * because the fields will be used later in storvsc_io_done(). */ request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status; + request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status; request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len; if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && @@ -1945,28 +1946,6 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) return(0); } -/* - * SCSI Inquiry checks qualifier and type. - * If qualifier is 011b, means the device server is not capable - * of supporting a peripheral device on this logical unit, and - * the type should be set to 1Fh. - * - * Return 1 if it is valid, 0 otherwise. 
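- * (For instance, inquiry byte 0 of 0x7f decodes to qualifier 011b and
- * type 1Fh (T_NODEVICE), i.e. the "no device at this LUN" combination
- * that this helper rejects.)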
- */ -static inline int -is_inquiry_valid(const struct scsi_inquiry_data *inq_data) -{ - uint8_t type; - if (SID_QUAL(inq_data) != SID_QUAL_LU_CONNECTED) { - return (0); - } - type = SID_TYPE(inq_data); - if (type == T_NODEVICE) { - return (0); - } - return (1); -} - /** * @brief completion function before returning to CAM * @@ -1985,7 +1964,6 @@ storvsc_io_done(struct hv_storvsc_request *reqp) struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; bus_dma_segment_t *ori_sglist = NULL; int ori_sg_count = 0; - /* destroy bounce buffer if it is used */ if (reqp->bounce_sgl_count) { ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; @@ -2040,88 +2018,71 @@ storvsc_io_done(struct hv_storvsc_request *reqp) ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (vm_srb->scsi_status == SCSI_STATUS_OK) { const struct scsi_generic *cmd; - /* - * Check whether the data for INQUIRY cmd is valid or - * not. Windows 10 and Windows 2016 send all zero - * inquiry data to VM even for unpopulated slots. - */ + + if (vm_srb->srb_status != SRB_STATUS_SUCCESS) { + if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) { + xpt_print(ccb->ccb_h.path, "invalid LUN %d\n", + vm_srb->lun); + } else { + xpt_print(ccb->ccb_h.path, "Unknown SRB flag: %d\n", + vm_srb->srb_status); + } + /* + * If there are errors, for example, invalid LUN, + * host will inform VM through SRB status. + */ + ccb->ccb_h.status |= CAM_SEL_TIMEOUT; + } else { + ccb->ccb_h.status |= CAM_REQ_CMP; + } + cmd = (const struct scsi_generic *) ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); if (cmd->opcode == INQUIRY) { - /* - * The host of Windows 10 or 2016 server will response - * the inquiry request with invalid data for unexisted device: - [0x7f 0x0 0x5 0x2 0x1f ... ] - * But on windows 2012 R2, the response is: - [0x7f 0x0 0x0 0x0 0x0 ] - * That is why here wants to validate the inquiry response. - * The validation will skip the INQUIRY whose response is short, - * which is less than SHORT_INQUIRY_LENGTH (36). - * - * For more information about INQUIRY, please refer to: - * ftp://ftp.avc-pioneer.com/Mtfuji_7/Proposal/Jun09/INQUIRY.pdf - */ - struct scsi_inquiry_data *inq_data = - (struct scsi_inquiry_data *)csio->data_ptr; - uint8_t* resp_buf = (uint8_t*)csio->data_ptr; - /* Get the buffer length reported by host */ - int resp_xfer_len = vm_srb->transfer_len; - /* Get the available buffer length */ - int resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; - int data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len; - if (data_len < SHORT_INQUIRY_LENGTH) { - ccb->ccb_h.status |= CAM_REQ_CMP; - if (bootverbose && data_len >= 5) { - mtx_lock(&sc->hs_lock); - xpt_print(ccb->ccb_h.path, - "storvsc skips the validation for short inquiry (%d)" - " [%x %x %x %x %x]\n", - data_len,resp_buf[0],resp_buf[1],resp_buf[2], - resp_buf[3],resp_buf[4]); - mtx_unlock(&sc->hs_lock); - } - } else if (is_inquiry_valid(inq_data) == 0) { - ccb->ccb_h.status |= CAM_DEV_NOT_THERE; + struct scsi_inquiry_data *inq_data = + (struct scsi_inquiry_data *)csio->data_ptr; + uint8_t *resp_buf = (uint8_t *)csio->data_ptr; + int resp_xfer_len, resp_buf_len, data_len; + + /* Get the buffer length reported by host */ + resp_xfer_len = vm_srb->transfer_len; + /* Get the available buffer length */ + resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; + data_len = (resp_buf_len < resp_xfer_len) ? 
+ resp_buf_len : resp_xfer_len; + if (bootverbose && data_len >= 5) { - mtx_lock(&sc->hs_lock); - xpt_print(ccb->ccb_h.path, - "storvsc uninstalled invalid device" - " [%x %x %x %x %x]\n", - resp_buf[0],resp_buf[1],resp_buf[2],resp_buf[3],resp_buf[4]); - mtx_unlock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, "storvsc inquiry " + "(%d) [%x %x %x %x %x ... ]\n", data_len, + resp_buf[0], resp_buf[1], resp_buf[2], + resp_buf[3], resp_buf[4]); } - } else { - char vendor[16]; - cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor), - sizeof(vendor)); - /** - * XXX: upgrade SPC2 to SPC3 if host is WIN8 or WIN2012 R2 - * in order to support UNMAP feature - */ - if (!strncmp(vendor,"Msft",4) && - SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 && - (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 || - vmstor_proto_version== VMSTOR_PROTOCOL_VERSION_WIN8)) { - inq_data->version = SCSI_REV_SPC3; - if (bootverbose) { - mtx_lock(&sc->hs_lock); - xpt_print(ccb->ccb_h.path, - "storvsc upgrades SPC2 to SPC3\n"); - mtx_unlock(&sc->hs_lock); + if (vm_srb->srb_status == SRB_STATUS_SUCCESS && + data_len > SHORT_INQUIRY_LENGTH) { + char vendor[16]; + + cam_strvis(vendor, inq_data->vendor, + sizeof(inq_data->vendor), sizeof(vendor)); + + /* + * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or + * WIN2012 R2 in order to support UNMAP feature. + */ + if (!strncmp(vendor, "Msft", 4) && + SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 && + (vmstor_proto_version == + VMSTOR_PROTOCOL_VERSION_WIN8_1 || + vmstor_proto_version == + VMSTOR_PROTOCOL_VERSION_WIN8)) { + inq_data->version = SCSI_REV_SPC3; + if (bootverbose) { + xpt_print(ccb->ccb_h.path, + "storvsc upgrades " + "SPC2 to SPC3\n"); + } } } - ccb->ccb_h.status |= CAM_REQ_CMP; - if (bootverbose) { - mtx_lock(&sc->hs_lock); - xpt_print(ccb->ccb_h.path, - "storvsc has passed inquiry response (%d) validation\n", - data_len); - mtx_unlock(&sc->hs_lock); - } - } - } else { - ccb->ccb_h.status |= CAM_REQ_CMP; } } else { mtx_lock(&sc->hs_lock); diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h index f2b9480..9205e35 100644 --- a/sys/dev/hyperv/storvsc/hv_vstorage.h +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -249,9 +249,9 @@ struct vstor_packet { /** * SRB Status Masks (can be combined with above status codes) */ -#define SRB_STATUS_QUEUE_FROZEN 0x40 -#define SRB_STATUS_AUTOSENSE_VALID 0x80 - +#define SRB_STATUS_QUEUE_FROZEN 0x40 +#define SRB_STATUS_AUTOSENSE_VALID 0x80 +#define SRB_STATUS_INVALID_LUN 0X20 /** * Packet flags diff --git a/sys/dev/isp/isp.c b/sys/dev/isp/isp.c index aa36453..9d38f60 100644 --- a/sys/dev/isp/isp.c +++ b/sys/dev/isp/isp.c @@ -2431,6 +2431,7 @@ isp_fc_enable_vp(ispsoftc_t *isp, int chan) __func__, chan, vp.vp_mod_hdr.rqs_flags, vp.vp_mod_status); return (EIO); } + GET_NANOTIME(&isp->isp_init_time); return (0); } @@ -5865,6 +5866,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox) * These are broadcast events that have to be sent across * all active channels. */ + GET_NANOTIME(&isp->isp_init_time); for (chan = 0; chan < isp->isp_nchan; chan++) { fcp = FCPARAM(isp, chan); int topo = fcp->isp_topo; @@ -5921,6 +5923,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox) * This is a broadcast event that has to be sent across * all active channels. 
*/ + GET_NANOTIME(&isp->isp_init_time); for (chan = 0; chan < isp->isp_nchan; chan++) { fcp = FCPARAM(isp, chan); if (fcp->role == ISP_ROLE_NONE) @@ -5964,6 +5967,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox) * This is a broadcast event that has to be sent across * all active channels. */ + GET_NANOTIME(&isp->isp_init_time); for (chan = 0; chan < isp->isp_nchan; chan++) { fcp = FCPARAM(isp, chan); if (fcp->role == ISP_ROLE_NONE) @@ -6162,6 +6166,7 @@ isp_handle_other_response(ispsoftc_t *isp, int type, isphdr_t *hp, uint32_t *opt portid = (uint32_t)rid.ridacq_vp_port_hi << 16 | rid.ridacq_vp_port_lo; if (rid.ridacq_format == 0) { + GET_NANOTIME(&isp->isp_init_time); for (chan = 0; chan < isp->isp_nchan; chan++) { fcparam *fcp = FCPARAM(isp, chan); if (fcp->role == ISP_ROLE_NONE) diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c index c6b8dc4..cfaccea 100644 --- a/sys/dev/isp/isp_freebsd.c +++ b/sys/dev/isp/isp_freebsd.c @@ -856,7 +856,7 @@ static void isp_handle_platform_atio7(ispsoftc_t *, at7_entry_t *); static void isp_handle_platform_ctio(ispsoftc_t *, void *); static void isp_handle_platform_notify_fc(ispsoftc_t *, in_fcentry_t *); static void isp_handle_platform_notify_24xx(ispsoftc_t *, in_fcentry_24xx_t *); -static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *); +static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *, uint32_t rsp); static void isp_handle_platform_target_tmf(ispsoftc_t *, isp_notify_t *); static void isp_target_mark_aborted(ispsoftc_t *, union ccb *); static void isp_target_mark_aborted_early(ispsoftc_t *, tstate_t *, uint32_t); @@ -2003,7 +2003,7 @@ noresrc: ntp = isp_get_ntpd(isp, tptr); if (ntp == NULL) { rls_lun_statep(isp, tptr); - isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0); + isp_endcmd(isp, aep, SCSI_STATUS_BUSY, 0); return; } memcpy(ntp->rd.data, aep, QENTRY_LEN); @@ -2055,7 +2055,7 @@ isp_handle_platform_atio7(ispsoftc_t *isp, at7_entry_t *aep) * It's a bit tricky here as we need to stash this command *somewhere*. 
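 * (On the hunk below: NANOTIME_SUB(&now, &isp->isp_init_time) is the
 * time elapsed since the ISP was last (re)initialized; with the operands
 * in the old order the unsigned difference was effectively always larger
 * than 2s, so the grace period for not-yet-found D_IDs never applied.)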
*/ GET_NANOTIME(&now); - if (NANOTIME_SUB(&isp->isp_init_time, &now) > 2000000000ULL) { + if (NANOTIME_SUB(&now, &isp->isp_init_time) > 2000000000ULL) { isp_prt(isp, ISP_LOGWARN, "%s: [RX_ID 0x%x] D_ID %x not found on any channel- dropping", __func__, aep->at_rxid, did); isp_endcmd(isp, aep, NIL_HANDLE, ISP_NOCHAN, ECMD_TERMINATE, 0); return; @@ -2761,7 +2761,7 @@ isp_handle_platform_notify_24xx(ispsoftc_t *isp, in_fcentry_24xx_t *inot) } static int -isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp) +isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp, uint32_t rsp) { if (isp->isp_state != ISP_RUNSTATE) { @@ -2796,6 +2796,15 @@ isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp) cto->ct_oxid = aep->at_hdr.ox_id; cto->ct_flags = CT7_SENDSTATUS|CT7_NOACK|CT7_NO_DATA|CT7_FLAG_MODE1; cto->ct_flags |= (aep->at_ta_len >> 12) << CT7_TASK_ATTR_SHIFT; + if (rsp != 0) { + cto->ct_scsi_status |= (FCP_RSPLEN_VALID << 8); + cto->rsp.m1.ct_resplen = 4; + ISP_MEMZERO(cto->rsp.m1.ct_resp, sizeof (cto->rsp.m1.ct_resp)); + cto->rsp.m1.ct_resp[0] = rsp & 0xff; + cto->rsp.m1.ct_resp[1] = (rsp >> 8) & 0xff; + cto->rsp.m1.ct_resp[2] = (rsp >> 16) & 0xff; + cto->rsp.m1.ct_resp[3] = (rsp >> 24) & 0xff; + } return (isp_target_put_entry(isp, &local)); } @@ -3642,7 +3651,8 @@ isp_action(struct cam_sim *sim, union ccb *ccb) xpt_done(ccb); break; } - if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt)) { + if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt, + (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0)) { rls_lun_statep(isp, tptr); cam_freeze_devq(ccb->ccb_h.path); cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0); @@ -4407,11 +4417,11 @@ changed: /* * This is device arrival/departure notification */ - isp_handle_platform_target_notify_ack(isp, notify); + isp_handle_platform_target_notify_ack(isp, notify, 0); break; default: isp_prt(isp, ISP_LOGALL, "target notify code 0x%x", notify->nt_ncode); - isp_handle_platform_target_notify_ack(isp, notify); + isp_handle_platform_target_notify_ack(isp, notify, 0); break; } break; diff --git a/sys/dev/ntb/if_ntb/if_ntb.c b/sys/dev/ntb/if_ntb/if_ntb.c index d107d06..33645c4 100644 --- a/sys/dev/ntb/if_ntb/if_ntb.c +++ b/sys/dev/ntb/if_ntb/if_ntb.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> * Copyright (C) 2013 Intel Corporation * Copyright (C) 2015 EMC Corporation * All rights reserved. @@ -25,21 +26,27 @@ * SUCH DAMAGE. */ +/* + * The Non-Transparent Bridge (NTB) is a device that allows you to connect + * two or more systems using PCI-e links, providing remote memory access. + * + * This module contains a driver for a simulated Ethernet device, using + * the underlying NTB Transport device. + * + * NOTE: Much of the code in this module is shared with Linux. Any patches may + * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */ + #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> -#include <sys/bitset.h> +#include <sys/buf_ring.h> #include <sys/bus.h> -#include <sys/ktr.h> #include <sys/limits.h> -#include <sys/lock.h> -#include <sys/malloc.h> #include <sys/module.h> -#include <sys/mutex.h> -#include <sys/queue.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> @@ -48,426 +55,163 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_media.h> #include <net/if_types.h> +#include <net/if_media.h> #include <net/if_var.h> #include <net/bpf.h> #include <net/ethernet.h> -#include <vm/vm.h> -#include <vm/pmap.h> - #include <machine/bus.h> -#include <machine/cpufunc.h> -#include <machine/pmap.h> - -#include <netinet/in.h> -#include <netinet/ip.h> - -#include "../ntb_hw/ntb_hw.h" - -/* - * The Non-Transparent Bridge (NTB) is a device on some Intel processors that - * allows you to connect two systems using a PCI-e link. - * - * This module contains a protocol for sending and receiving messages, and - * exposes that protocol through a simulated ethernet device called ntb. - * - * NOTE: Much of the code in this module is shared with Linux. Any patches may - * be picked up and redistributed in Linux with a dual GPL/BSD license. - */ -#define QP_SETSIZE 64 -BITSET_DEFINE(_qpset, QP_SETSIZE); -#define test_bit(pos, addr) BIT_ISSET(QP_SETSIZE, (pos), (addr)) -#define set_bit(pos, addr) BIT_SET(QP_SETSIZE, (pos), (addr)) -#define clear_bit(pos, addr) BIT_CLR(QP_SETSIZE, (pos), (addr)) -#define ffs_bit(addr) BIT_FFS(QP_SETSIZE, (addr)) +#include "../ntb_transport.h" #define KTR_NTB KTR_SPARE3 +#define NTB_MEDIATYPE (IFM_ETHER | IFM_AUTO | IFM_FDX) -#define NTB_TRANSPORT_VERSION 4 -#define NTB_RX_MAX_PKTS 64 -#define NTB_RXQ_SIZE 300 - -enum ntb_link_event { - NTB_LINK_DOWN = 0, - NTB_LINK_UP, -}; +#define NTB_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) +#define NTB_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) +#define NTB_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ + CSUM_PSEUDO_HDR | \ + CSUM_IP_CHECKED | CSUM_IP_VALID | \ + CSUM_SCTP_VALID) static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb"); -static unsigned g_if_ntb_debug_level; -TUNABLE_INT("hw.if_ntb.debug_level", &g_if_ntb_debug_level); -SYSCTL_UINT(_hw_if_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN, - &g_if_ntb_debug_level, 0, "if_ntb log level -- higher is more verbose"); -#define ntb_printf(lvl, ...) do { \ - if ((lvl) <= g_if_ntb_debug_level) { \ - if_printf(nt->ifp, __VA_ARGS__); \ - } \ -} while (0) - -static unsigned transport_mtu = IP_MAXPACKET + ETHER_HDR_LEN + ETHER_CRC_LEN; - -static uint64_t max_mw_size; -TUNABLE_QUAD("hw.if_ntb.max_mw_size", &max_mw_size); -SYSCTL_UQUAD(_hw_if_ntb, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0, - "If enabled (non-zero), limit the size of large memory windows. " - "Both sides of the NTB MUST set the same value here."); - -static unsigned max_num_clients; -TUNABLE_INT("hw.if_ntb.max_num_clients", &max_num_clients); -SYSCTL_UINT(_hw_if_ntb, OID_AUTO, max_num_clients, CTLFLAG_RDTUN, - &max_num_clients, 0, "Maximum number of NTB transport clients. 
" - "0 (default) - use all available NTB memory windows; " - "positive integer N - Limit to N memory windows."); - -static unsigned enable_xeon_watchdog; -TUNABLE_INT("hw.if_ntb.enable_xeon_watchdog", &enable_xeon_watchdog); -SYSCTL_UINT(_hw_if_ntb, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN, - &enable_xeon_watchdog, 0, "If non-zero, write a register every second to " - "keep a watchdog from tearing down the NTB link"); - -STAILQ_HEAD(ntb_queue_list, ntb_queue_entry); - -typedef uint32_t ntb_q_idx_t; - -struct ntb_queue_entry { - /* ntb_queue list reference */ - STAILQ_ENTRY(ntb_queue_entry) entry; - - /* info on data to be transferred */ - void *cb_data; - void *buf; - uint32_t len; - uint32_t flags; - - struct ntb_transport_qp *qp; - struct ntb_payload_header *x_hdr; - ntb_q_idx_t index; -}; - -struct ntb_rx_info { - ntb_q_idx_t entry; -}; - -struct ntb_transport_qp { - struct ntb_transport_ctx *transport; - struct ntb_softc *ntb; - - void *cb_data; - - bool client_ready; - volatile bool link_is_up; - uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */ - - struct ntb_rx_info *rx_info; - struct ntb_rx_info *remote_rx_info; - - void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - struct ntb_queue_list tx_free_q; - struct mtx ntb_tx_free_q_lock; - caddr_t tx_mw; - bus_addr_t tx_mw_phys; - ntb_q_idx_t tx_index; - ntb_q_idx_t tx_max_entry; - uint64_t tx_max_frame; - - void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - struct ntb_queue_list rx_post_q; - struct ntb_queue_list rx_pend_q; - /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */ - struct mtx ntb_rx_q_lock; - struct task rx_completion_task; - struct task rxc_db_work; - caddr_t rx_buff; - ntb_q_idx_t rx_index; - ntb_q_idx_t rx_max_entry; - uint64_t rx_max_frame; - - void (*event_handler)(void *data, enum ntb_link_event status); - struct callout link_work; - struct callout queue_full; - struct callout rx_full; - - uint64_t last_rx_no_buf; +static unsigned g_if_ntb_num_queues = UINT_MAX; +SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN, + &g_if_ntb_num_queues, 0, "Number of queues per interface"); - /* Stats */ - uint64_t rx_bytes; - uint64_t rx_pkts; - uint64_t rx_ring_empty; - uint64_t rx_err_no_buf; - uint64_t rx_err_oflow; - uint64_t rx_err_ver; - uint64_t tx_bytes; - uint64_t tx_pkts; - uint64_t tx_ring_full; - uint64_t tx_err_no_buf; -}; - -struct ntb_queue_handlers { - void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, - void *data, int len); - void (*event_handler)(void *data, enum ntb_link_event status); -}; - -struct ntb_transport_mw { - vm_paddr_t phys_addr; - size_t phys_size; - size_t xlat_align; - size_t xlat_align_size; - bus_addr_t addr_limit; - /* Tx buff is off vbase / phys_addr */ - caddr_t vbase; - size_t xlat_size; - size_t buff_size; - /* Rx buff is off virt_addr / dma_addr */ - caddr_t virt_addr; - bus_addr_t dma_addr; -}; - -struct ntb_transport_ctx { - struct ntb_softc *ntb; +struct ntb_net_queue { + struct ntb_net_ctx *sc; struct ifnet *ifp; - struct ntb_transport_mw mw_vec[NTB_MAX_NUM_MW]; - struct ntb_transport_qp *qp_vec; - struct _qpset qp_bitmap; - struct _qpset qp_bitmap_free; - unsigned mw_count; - unsigned qp_count; - volatile bool link_is_up; - struct callout link_work; - struct callout link_watchdog; - struct task link_cleanup; - uint64_t bufsize; - u_char eaddr[ETHER_ADDR_LEN]; - struct mtx tx_lock; - struct mtx 
rx_lock; - - /* The hardcoded single queuepair in ntb_setup_interface() */ struct ntb_transport_qp *qp; + struct buf_ring *br; + struct task tx_task; + struct taskqueue *tx_tq; + struct mtx tx_lock; + struct callout queue_full; }; -static struct ntb_transport_ctx net_softc; - -enum { - IF_NTB_DESC_DONE_FLAG = 1 << 0, - IF_NTB_LINK_DOWN_FLAG = 1 << 1, -}; - -struct ntb_payload_header { - ntb_q_idx_t ver; - uint32_t len; - uint32_t flags; -}; - -enum { - /* - * The order of this enum is part of the if_ntb remote protocol. Do - * not reorder without bumping protocol version (and it's probably best - * to keep the protocol in lock-step with the Linux NTB driver. - */ - IF_NTB_VERSION = 0, - IF_NTB_QP_LINKS, - IF_NTB_NUM_QPS, - IF_NTB_NUM_MWS, - /* - * N.B.: transport_link_work assumes MW1 enums = MW0 + 2. - */ - IF_NTB_MW0_SZ_HIGH, - IF_NTB_MW0_SZ_LOW, - IF_NTB_MW1_SZ_HIGH, - IF_NTB_MW1_SZ_LOW, - IF_NTB_MAX_SPAD, - - /* - * Some NTB-using hardware have a watchdog to work around NTB hangs; if - * a register or doorbell isn't written every few seconds, the link is - * torn down. Write an otherwise unused register every few seconds to - * work around this watchdog. - */ - IF_NTB_WATCHDOG_SPAD = 15 +struct ntb_net_ctx { + struct ifnet *ifp; + struct ifmedia media; + u_char eaddr[ETHER_ADDR_LEN]; + int num_queues; + struct ntb_net_queue *queues; + int mtu; }; -CTASSERT(IF_NTB_WATCHDOG_SPAD < XEON_SPAD_COUNT && - IF_NTB_WATCHDOG_SPAD < ATOM_SPAD_COUNT); - -#define QP_TO_MW(nt, qp) ((qp) % nt->mw_count) -#define NTB_QP_DEF_NUM_ENTRIES 100 -#define NTB_LINK_DOWN_TIMEOUT 10 -static int ntb_handle_module_events(struct module *m, int what, void *arg); -static int ntb_setup_interface(void); -static int ntb_teardown_interface(void); +static int ntb_net_probe(device_t dev); +static int ntb_net_attach(device_t dev); +static int ntb_net_detach(device_t dev); static void ntb_net_init(void *arg); +static int ntb_ifmedia_upd(struct ifnet *); +static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *); static int ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data); -static void ntb_start(struct ifnet *ifp); +static int ntb_transmit(struct ifnet *ifp, struct mbuf *m); static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, int len); static void ntb_net_event_handler(void *data, enum ntb_link_event status); -static int ntb_transport_probe(struct ntb_softc *ntb); -static void ntb_transport_free(struct ntb_transport_ctx *); -static void ntb_transport_init_queue(struct ntb_transport_ctx *nt, - unsigned int qp_num); -static void ntb_transport_free_queue(struct ntb_transport_qp *qp); -static struct ntb_transport_qp *ntb_transport_create_queue(void *data, - struct ntb_softc *pdev, const struct ntb_queue_handlers *handlers); -static void ntb_transport_link_up(struct ntb_transport_qp *qp); -static int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, - void *data, unsigned int len); -static int ntb_process_tx(struct ntb_transport_qp *qp, - struct ntb_queue_entry *entry); -static void ntb_memcpy_tx(struct ntb_transport_qp *qp, - struct ntb_queue_entry *entry, void *offset); +static void ntb_handle_tx(void *arg, int pending); static void ntb_qp_full(void *arg); -static void ntb_transport_rxc_db(void *arg, int pending); -static int ntb_process_rxc(struct ntb_transport_qp *qp); -static void ntb_memcpy_rx(struct ntb_transport_qp *qp, - struct ntb_queue_entry *entry, void 
*offset); -static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp, - void *data); -static void ntb_complete_rxc(void *arg, int pending); -static void ntb_transport_doorbell_callback(void *data, uint32_t vector); -static void ntb_transport_event_callback(void *data); -static void ntb_transport_link_work(void *arg); -static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size); -static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw); -static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, - unsigned int qp_num); -static void ntb_qp_link_work(void *arg); -static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt); -static void ntb_transport_link_cleanup_work(void *, int); -static void ntb_qp_link_down(struct ntb_transport_qp *qp); -static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp); -static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp); -static void ntb_transport_link_down(struct ntb_transport_qp *qp); -static void ntb_send_link_down(struct ntb_transport_qp *qp); -static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry, - struct ntb_queue_list *list); -static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock, - struct ntb_queue_list *list); -static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock, - struct ntb_queue_list *from, struct ntb_queue_list *to); +static void ntb_qflush(struct ifnet *ifp); static void create_random_local_eui48(u_char *eaddr); -static unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp); -static void xeon_link_watchdog_hb(void *); - -static const struct ntb_ctx_ops ntb_transport_ops = { - .link_event = ntb_transport_event_callback, - .db_event = ntb_transport_doorbell_callback, -}; -MALLOC_DEFINE(M_NTB_IF, "if_ntb", "ntb network driver"); - -static inline void -iowrite32(uint32_t val, void *addr) -{ - - bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr, - val); -} - -/* Module setup and teardown */ static int -ntb_handle_module_events(struct module *m, int what, void *arg) +ntb_net_probe(device_t dev) { - int err = 0; - switch (what) { - case MOD_LOAD: - err = ntb_setup_interface(); - break; - case MOD_UNLOAD: - err = ntb_teardown_interface(); - break; - default: - err = EOPNOTSUPP; - break; - } - return (err); + device_set_desc(dev, "NTB Network Interface"); + return (0); } -static moduledata_t if_ntb_mod = { - "if_ntb", - ntb_handle_module_events, - NULL -}; - -DECLARE_MODULE(if_ntb, if_ntb_mod, SI_SUB_KLD, SI_ORDER_ANY); -MODULE_DEPEND(if_ntb, ntb_hw, 1, 1, 1); - static int -ntb_setup_interface(void) +ntb_net_attach(device_t dev) { + struct ntb_net_ctx *sc = device_get_softc(dev); + struct ntb_net_queue *q; struct ifnet *ifp; struct ntb_queue_handlers handlers = { ntb_net_rx_handler, ntb_net_tx_handler, ntb_net_event_handler }; - int rc; - - net_softc.ntb = devclass_get_softc(devclass_find("ntb_hw"), 0); - if (net_softc.ntb == NULL) { - printf("ntb: Cannot find devclass\n"); - return (ENXIO); - } + int i; - ifp = net_softc.ifp = if_alloc(IFT_ETHER); + ifp = sc->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { - ntb_transport_free(&net_softc); printf("ntb: Cannot allocate ifnet structure\n"); return (ENOMEM); } - if_initname(ifp, "ntb", 0); - - rc = ntb_transport_probe(net_softc.ntb); - if (rc != 0) { - printf("ntb: Cannot init transport: %d\n", rc); - if_free(net_softc.ifp); - return (rc); - } + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + + sc->num_queues = min(g_if_ntb_num_queues, + ntb_transport_queue_count(dev)); + 
sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue), + M_DEVBUF, M_WAITOK | M_ZERO); + sc->mtu = INT_MAX; + for (i = 0; i < sc->num_queues; i++) { + q = &sc->queues[i]; + q->sc = sc; + q->ifp = ifp; + q->qp = ntb_transport_create_queue(dev, i, &handlers, q); + if (q->qp == NULL) + break; + sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp)); + mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF); + q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock); + TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q); + q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT, + taskqueue_thread_enqueue, &q->tx_tq); + taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d", + device_get_nameunit(dev), i); + callout_init(&q->queue_full, 1); + } + sc->num_queues = i; + device_printf(dev, "%d queue(s)\n", sc->num_queues); - net_softc.qp = ntb_transport_create_queue(ifp, net_softc.ntb, - &handlers); ifp->if_init = ntb_net_init; - ifp->if_softc = &net_softc; - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; + ifp->if_softc = sc; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ntb_ioctl; - ifp->if_start = ntb_start; - IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN); - ifp->if_snd.ifq_drv_maxlen = IFQ_MAXLEN; - IFQ_SET_READY(&ifp->if_snd); - create_random_local_eui48(net_softc.eaddr); - ether_ifattach(ifp, net_softc.eaddr); - ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_JUMBO_MTU; - ifp->if_capenable = ifp->if_capabilities; - ifp->if_mtu = ntb_transport_max_size(net_softc.qp) - ETHER_HDR_LEN - - ETHER_CRC_LEN; - - ntb_transport_link_up(net_softc.qp); - net_softc.bufsize = ntb_transport_max_size(net_softc.qp) + - sizeof(struct ether_header); + ifp->if_transmit = ntb_transmit; + ifp->if_qflush = ntb_qflush; + create_random_local_eui48(sc->eaddr); + ether_ifattach(ifp, sc->eaddr); + ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | + IFCAP_JUMBO_MTU | IFCAP_LINKSTATE; + ifp->if_capenable = IFCAP_JUMBO_MTU | IFCAP_LINKSTATE; + ifp->if_mtu = sc->mtu - ETHER_HDR_LEN; + + ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd, + ntb_ifmedia_sts); + ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL); + ifmedia_set(&sc->media, NTB_MEDIATYPE); + + for (i = 0; i < sc->num_queues; i++) + ntb_transport_link_up(sc->queues[i].qp); return (0); } static int -ntb_teardown_interface(void) +ntb_net_detach(device_t dev) { + struct ntb_net_ctx *sc = device_get_softc(dev); + struct ntb_net_queue *q; + int i; - if (net_softc.qp != NULL) { - ntb_transport_link_down(net_softc.qp); - - ntb_transport_free_queue(net_softc.qp); - ntb_transport_free(&net_softc); - } - - if (net_softc.ifp != NULL) { - ether_ifdetach(net_softc.ifp); - if_free(net_softc.ifp); - net_softc.ifp = NULL; - } - + for (i = 0; i < sc->num_queues; i++) + ntb_transport_link_down(sc->queues[i].qp); + ether_ifdetach(sc->ifp); + if_free(sc->ifp); + ifmedia_removeall(&sc->media); + for (i = 0; i < sc->num_queues; i++) { + q = &sc->queues[i]; + ntb_transport_free_queue(q->qp); + buf_ring_free(q->br, M_DEVBUF); + callout_drain(&q->queue_full); + taskqueue_drain_all(q->tx_tq); + mtx_destroy(&q->tx_lock); + } + free(sc->queues, M_DEVBUF); return (0); } @@ -476,27 +220,26 @@ ntb_teardown_interface(void) static void ntb_net_init(void *arg) { - struct ntb_transport_ctx *ntb_softc = arg; - struct ifnet *ifp = ntb_softc->ifp; + struct ntb_net_ctx *sc = arg; + struct ifnet *ifp = sc->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - ifp->if_flags |= IFF_UP; - if_link_state_change(ifp, LINK_STATE_UP); + 
if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ? + LINK_STATE_UP : LINK_STATE_DOWN); } static int ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { - struct ntb_transport_ctx *nt = ifp->if_softc; + struct ntb_net_ctx *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (command) { case SIOCSIFMTU: { - if (ifr->ifr_mtu > ntb_transport_max_size(nt->qp) - - ETHER_HDR_LEN - ETHER_CRC_LEN) { + if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) { error = EINVAL; break; } @@ -504,1185 +247,242 @@ ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) ifp->if_mtu = ifr->ifr_mtu; break; } - default: - error = ether_ioctl(ifp, command, data); - break; - } - - return (error); -} + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = ifmedia_ioctl(ifp, ifr, &sc->media, command); + break; -static void -ntb_start(struct ifnet *ifp) -{ - struct mbuf *m_head; - struct ntb_transport_ctx *nt = ifp->if_softc; - int rc; - - mtx_lock(&nt->tx_lock); - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - CTR0(KTR_NTB, "TX: ntb_start"); - while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { - IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); - CTR1(KTR_NTB, "TX: start mbuf %p", m_head); - rc = ntb_transport_tx_enqueue(nt->qp, m_head, m_head, - m_length(m_head, NULL)); - if (rc != 0) { - CTR1(KTR_NTB, - "TX: could not tx mbuf %p. Returning to snd q", - m_head); - if (rc == EAGAIN) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - IFQ_DRV_PREPEND(&ifp->if_snd, m_head); - callout_reset(&nt->qp->queue_full, hz / 1000, - ntb_qp_full, ifp); - } - break; + case SIOCSIFCAP: + if (ifr->ifr_reqcap & IFCAP_RXCSUM) + ifp->if_capenable |= IFCAP_RXCSUM; + else + ifp->if_capenable &= ~IFCAP_RXCSUM; + if (ifr->ifr_reqcap & IFCAP_TXCSUM) { + ifp->if_capenable |= IFCAP_TXCSUM; + ifp->if_hwassist |= NTB_CSUM_FEATURES; + } else { + ifp->if_capenable &= ~IFCAP_TXCSUM; + ifp->if_hwassist &= ~NTB_CSUM_FEATURES; + } + if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6) + ifp->if_capenable |= IFCAP_RXCSUM_IPV6; + else + ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6; + if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable |= IFCAP_TXCSUM_IPV6; + ifp->if_hwassist |= NTB_CSUM_FEATURES6; + } else { + ifp->if_capenable &= ~IFCAP_TXCSUM_IPV6; + ifp->if_hwassist &= ~NTB_CSUM_FEATURES6; } - - } - mtx_unlock(&nt->tx_lock); -} - -/* Network Device Callbacks */ -static void -ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, - int len) -{ - - m_freem(data); - CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data); -} - -static void -ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, - int len) -{ - struct mbuf *m = data; - struct ifnet *ifp = qp_data; - - CTR0(KTR_NTB, "RX: rx handler"); - (*ifp->if_input)(ifp, m); -} - -static void -ntb_net_event_handler(void *data, enum ntb_link_event status) -{ - struct ifnet *ifp; - - ifp = data; - (void)ifp; - - /* XXX The Linux driver munges with the carrier status here. 
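- * (In the Linux ntb_netdev counterpart this is netif_carrier_on()/
- * netif_carrier_off(); the rewritten driver instead reports link state
- * through if_link_state_change() and IFCAP_LINKSTATE.)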
*/ - - switch (status) { - case NTB_LINK_DOWN: - break; - case NTB_LINK_UP: break; - default: - panic("Bogus ntb_link_event %u\n", status); - } -} - -/* Transport Init and teardown */ - -static void -xeon_link_watchdog_hb(void *arg) -{ - struct ntb_transport_ctx *nt; - - nt = arg; - ntb_spad_write(nt->ntb, IF_NTB_WATCHDOG_SPAD, 0); - callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt); -} - -static int -ntb_transport_probe(struct ntb_softc *ntb) -{ - struct ntb_transport_ctx *nt = &net_softc; - struct ntb_transport_mw *mw; - uint64_t qp_bitmap; - int rc; - unsigned i; - - nt->mw_count = ntb_mw_count(ntb); - for (i = 0; i < nt->mw_count; i++) { - mw = &nt->mw_vec[i]; - - rc = ntb_mw_get_range(ntb, i, &mw->phys_addr, &mw->vbase, - &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size, - &mw->addr_limit); - if (rc != 0) - goto err; - - mw->buff_size = 0; - mw->xlat_size = 0; - mw->virt_addr = NULL; - mw->dma_addr = 0; - - rc = ntb_mw_set_wc(nt->ntb, i, VM_MEMATTR_WRITE_COMBINING); - if (rc) - ntb_printf(0, "Unable to set mw%d caching\n", i); - } - - qp_bitmap = ntb_db_valid_mask(ntb); - nt->qp_count = flsll(qp_bitmap); - KASSERT(nt->qp_count != 0, ("bogus db bitmap")); - nt->qp_count -= 1; - - if (max_num_clients != 0 && max_num_clients < nt->qp_count) - nt->qp_count = max_num_clients; - else if (nt->mw_count < nt->qp_count) - nt->qp_count = nt->mw_count; - KASSERT(nt->qp_count <= QP_SETSIZE, ("invalid qp_count")); - - mtx_init(&nt->tx_lock, "ntb transport tx", NULL, MTX_DEF); - mtx_init(&nt->rx_lock, "ntb transport rx", NULL, MTX_DEF); - - nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_IF, - M_WAITOK | M_ZERO); - - for (i = 0; i < nt->qp_count; i++) { - set_bit(i, &nt->qp_bitmap); - set_bit(i, &nt->qp_bitmap_free); - ntb_transport_init_queue(nt, i); - } - - callout_init(&nt->link_work, 0); - callout_init(&nt->link_watchdog, 0); - TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt); - - rc = ntb_set_ctx(ntb, nt, &ntb_transport_ops); - if (rc != 0) - goto err; - - nt->link_is_up = false; - ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); - ntb_link_event(ntb); - - callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt); - if (enable_xeon_watchdog != 0) - callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt); - return (0); - -err: - free(nt->qp_vec, M_NTB_IF); - nt->qp_vec = NULL; - return (rc); -} - -static void -ntb_transport_free(struct ntb_transport_ctx *nt) -{ - struct ntb_softc *ntb = nt->ntb; - struct _qpset qp_bitmap_alloc; - uint8_t i; - - ntb_transport_link_cleanup(nt); - taskqueue_drain(taskqueue_swi, &nt->link_cleanup); - callout_drain(&nt->link_work); - callout_drain(&nt->link_watchdog); - - BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc); - BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free); - - /* Verify that all the QPs are freed */ - for (i = 0; i < nt->qp_count; i++) - if (test_bit(i, &qp_bitmap_alloc)) - ntb_transport_free_queue(&nt->qp_vec[i]); - - ntb_link_disable(ntb); - ntb_clear_ctx(ntb); - - for (i = 0; i < nt->mw_count; i++) - ntb_free_mw(nt, i); - - free(nt->qp_vec, M_NTB_IF); -} - -static void -ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num) -{ - struct ntb_transport_mw *mw; - struct ntb_transport_qp *qp; - vm_paddr_t mw_base; - uint64_t mw_size, qp_offset; - size_t tx_size; - unsigned num_qps_mw, mw_num, mw_count; - - mw_count = nt->mw_count; - mw_num = QP_TO_MW(nt, qp_num); - mw = &nt->mw_vec[mw_num]; - - qp = &nt->qp_vec[qp_num]; - qp->qp_num = qp_num; 
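- /*
-  * (Worked example of the layout math below, assuming mw_count = 2 and
-  * qp_count = 4: QP_TO_MW() puts QPs 0 and 2 on MW 0 and QPs 1 and 3 on
-  * MW 1; with mw_size = 1M, num_qps_mw = 2 gives tx_size = 512K per QP,
-  * and QP 2 lands at qp_offset = 512K * (2 / 2) = 512K.)
-  */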
- qp->transport = nt; - qp->ntb = nt->ntb; - qp->client_ready = false; - qp->event_handler = NULL; - ntb_qp_link_down_reset(qp); - - if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count) - num_qps_mw = nt->qp_count / mw_count + 1; - else - num_qps_mw = nt->qp_count / mw_count; - - mw_base = mw->phys_addr; - mw_size = mw->phys_size; - - tx_size = mw_size / num_qps_mw; - qp_offset = tx_size * (qp_num / mw_count); - - qp->tx_mw = mw->vbase + qp_offset; - KASSERT(qp->tx_mw != NULL, ("uh oh?")); - - /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */ - qp->tx_mw_phys = mw_base + qp_offset; - KASSERT(qp->tx_mw_phys != 0, ("uh oh?")); - - tx_size -= sizeof(struct ntb_rx_info); - qp->rx_info = (void *)(qp->tx_mw + tx_size); - - /* Due to house-keeping, there must be at least 2 buffs */ - qp->tx_max_frame = qmin(tx_size / 2, - transport_mtu + sizeof(struct ntb_payload_header)); - qp->tx_max_entry = tx_size / qp->tx_max_frame; - - callout_init(&qp->link_work, 0); - callout_init(&qp->queue_full, CALLOUT_MPSAFE); - callout_init(&qp->rx_full, CALLOUT_MPSAFE); - - mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN); - mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN); - TASK_INIT(&qp->rx_completion_task, 0, ntb_complete_rxc, qp); - TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp); - - STAILQ_INIT(&qp->rx_post_q); - STAILQ_INIT(&qp->rx_pend_q); - STAILQ_INIT(&qp->tx_free_q); - callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp); -} - -static void -ntb_transport_free_queue(struct ntb_transport_qp *qp) -{ - struct ntb_queue_entry *entry; - - if (qp == NULL) - return; - - callout_drain(&qp->link_work); - - ntb_db_set_mask(qp->ntb, 1ull << qp->qp_num); - taskqueue_drain(taskqueue_swi, &qp->rxc_db_work); - taskqueue_drain(taskqueue_swi, &qp->rx_completion_task); - - qp->cb_data = NULL; - qp->rx_handler = NULL; - qp->tx_handler = NULL; - qp->event_handler = NULL; - - while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q))) - free(entry, M_NTB_IF); - - while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q))) - free(entry, M_NTB_IF); - - while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q))) - free(entry, M_NTB_IF); - - set_bit(qp->qp_num, &qp->transport->qp_bitmap_free); -} - -/** - * ntb_transport_create_queue - Create a new NTB transport layer queue - * @rx_handler: receive callback function - * @tx_handler: transmit callback function - * @event_handler: event callback function - * - * Create a new NTB transport layer queue and provide the queue with a callback - * routine for both transmit and receive. The receive callback routine will be - * used to pass up data when the transport has received it on the queue. The - * transmit callback routine will be called when the transport has completed the - * transmission of the data on the queue and the data is ready to be freed. - * - * RETURNS: pointer to newly created ntb_queue, NULL on error. 
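
As a rough illustration of the callback contract described above, a client of this (pre-change) KPI might look like the sketch below; the names, the softc pointer and the error handling are hypothetical and not part of this patch:

    /* Hypothetical client callbacks matching struct ntb_queue_handlers. */
    static void
    my_rx_handler(struct ntb_transport_qp *qp, void *cb_data, void *data, int len)
    {
            /* The client now owns the received mbuf passed in 'data'. */
            m_freem(data);
    }

    static void
    my_tx_handler(struct ntb_transport_qp *qp, void *cb_data, void *data, int len)
    {
            /* Transmission is complete; the buffer may be freed or recycled. */
            m_freem(data);
    }

    static void
    my_event_handler(void *cb_data, enum ntb_link_event ev)
    {
            /* React to NTB_LINK_UP / NTB_LINK_DOWN. */
    }

    static const struct ntb_queue_handlers my_handlers = {
            .rx_handler = my_rx_handler,
            .tx_handler = my_tx_handler,
            .event_handler = my_event_handler,
    };

    /* In the client attach path: */
    qp = ntb_transport_create_queue(sc, ntb, &my_handlers);
    if (qp == NULL)
            return (ENOMEM);
    ntb_transport_link_up(qp);
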
- */ -static struct ntb_transport_qp * -ntb_transport_create_queue(void *data, struct ntb_softc *ntb, - const struct ntb_queue_handlers *handlers) -{ - struct ntb_queue_entry *entry; - struct ntb_transport_qp *qp; - struct ntb_transport_ctx *nt; - unsigned int free_queue; - int i; - - nt = ntb_get_ctx(ntb, NULL); - KASSERT(nt != NULL, ("bogus")); - - free_queue = ffs_bit(&nt->qp_bitmap); - if (free_queue == 0) - return (NULL); - - /* decrement free_queue to make it zero based */ - free_queue--; - - qp = &nt->qp_vec[free_queue]; - clear_bit(qp->qp_num, &nt->qp_bitmap_free); - qp->cb_data = data; - qp->rx_handler = handlers->rx_handler; - qp->tx_handler = handlers->tx_handler; - qp->event_handler = handlers->event_handler; - - for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) { - entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO); - entry->cb_data = nt->ifp; - entry->buf = NULL; - entry->len = transport_mtu; - ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q); - } - - for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) { - entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO); - ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); + default: + error = ether_ioctl(ifp, command, data); + break; } - ntb_db_clear(ntb, 1ull << qp->qp_num); - ntb_db_clear_mask(ntb, 1ull << qp->qp_num); - return (qp); -} - -/** - * ntb_transport_link_up - Notify NTB transport of client readiness to use queue - * @qp: NTB transport layer queue to be enabled - * - * Notify NTB transport layer of client readiness to use queue - */ -static void -ntb_transport_link_up(struct ntb_transport_qp *qp) -{ - struct ntb_transport_ctx *nt; - - if (qp == NULL) - return; - - qp->client_ready = true; - - nt = qp->transport; - ntb_printf(2, "qp client ready\n"); - - if (qp->transport->link_is_up) - callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp); + return (error); } - - -/* Transport Tx */ - -/** - * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry - * @qp: NTB transport layer queue the entry is to be enqueued on - * @cb: per buffer pointer for callback function to use - * @data: pointer to data buffer that will be sent - * @len: length of the data buffer - * - * Enqueue a new transmit buffer onto the transport queue from which a NTB - * payload will be transmitted. This assumes that a lock is being held to - * serialize access to the qp. - * - * RETURNS: An appropriate ERRNO error value on error, or zero for success. - */ static int -ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, - unsigned int len) +ntb_ifmedia_upd(struct ifnet *ifp) { - struct ntb_queue_entry *entry; - int rc; + struct ntb_net_ctx *sc = ifp->if_softc; + struct ifmedia *ifm = &sc->media; - if (qp == NULL || !qp->link_is_up || len == 0) { - CTR0(KTR_NTB, "TX: link not up"); + if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); - } - - entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q); - if (entry == NULL) { - CTR0(KTR_NTB, "TX: could not get entry from tx_free_q"); - qp->tx_err_no_buf++; - return (EBUSY); - } - CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry); - - entry->cb_data = cb; - entry->buf = data; - entry->len = len; - entry->flags = 0; - - rc = ntb_process_tx(qp, entry); - if (rc != 0) { - ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); - CTR1(KTR_NTB, - "TX: process_tx failed. 
Returning entry %p to tx_free_q", - entry); - } - return (rc); -} - -static int -ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry) -{ - void *offset; - - offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index; - CTR3(KTR_NTB, - "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u", - qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry); - if (qp->tx_index == qp->remote_rx_info->entry) { - CTR0(KTR_NTB, "TX: ring full"); - qp->tx_ring_full++; - return (EAGAIN); - } - - if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) { - if (qp->tx_handler != NULL) - qp->tx_handler(qp, qp->cb_data, entry->buf, - EIO); - else - m_freem(entry->buf); - - entry->buf = NULL; - ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); - CTR1(KTR_NTB, - "TX: frame too big. returning entry %p to tx_free_q", - entry); - return (0); - } - CTR2(KTR_NTB, "TX: copying entry %p to offset %p", entry, offset); - ntb_memcpy_tx(qp, entry, offset); - - qp->tx_index++; - qp->tx_index %= qp->tx_max_entry; - - qp->tx_pkts++; return (0); } static void -ntb_memcpy_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry, - void *offset) -{ - struct ntb_payload_header *hdr; - - /* This piece is from Linux' ntb_async_tx() */ - hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame - - sizeof(struct ntb_payload_header)); - entry->x_hdr = hdr; - iowrite32(entry->len, &hdr->len); - iowrite32(qp->tx_pkts, &hdr->ver); - - /* This piece is ntb_memcpy_tx() */ - CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset); - if (entry->buf != NULL) { - m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset); - - /* - * Ensure that the data is fully copied before setting the - * flags - */ - wmb(); - } - - /* The rest is ntb_tx_copy_callback() */ - iowrite32(entry->flags | IF_NTB_DESC_DONE_FLAG, &hdr->flags); - CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr); - - ntb_peer_db_set(qp->ntb, 1ull << qp->qp_num); - - /* - * The entry length can only be zero if the packet is intended to be a - * "link down" or similar. Since no payload is being sent in these - * cases, there is nothing to add to the completion queue. - */ - if (entry->len > 0) { - qp->tx_bytes += entry->len; - - if (qp->tx_handler) - qp->tx_handler(qp, qp->cb_data, entry->buf, - entry->len); - else - m_freem(entry->buf); - entry->buf = NULL; - } - - CTR3(KTR_NTB, - "TX: entry %p sent. 
hdr->ver = %u, hdr->flags = 0x%x, Returning " - "to tx_free_q", entry, hdr->ver, hdr->flags); - ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); -} - -static void -ntb_qp_full(void *arg) +ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { + struct ntb_net_ctx *sc = ifp->if_softc; - CTR0(KTR_NTB, "TX: qp_full callout"); - ntb_start(arg); + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = NTB_MEDIATYPE; + if (ntb_transport_link_query(sc->queues[0].qp)) + ifmr->ifm_status |= IFM_ACTIVE; } -/* Transport Rx */ static void -ntb_transport_rxc_db(void *arg, int pending __unused) +ntb_transmit_locked(struct ntb_net_queue *q) { - struct ntb_transport_qp *qp = arg; - ntb_q_idx_t i; - int rc; - - /* - * Limit the number of packets processed in a single interrupt to - * provide fairness to others - */ - CTR0(KTR_NTB, "RX: transport_rx"); - mtx_lock(&qp->transport->rx_lock); - for (i = 0; i < qp->rx_max_entry; i++) { - rc = ntb_process_rxc(qp); + struct ifnet *ifp = q->ifp; + struct mbuf *m; + int rc, len; + short mflags; + + CTR0(KTR_NTB, "TX: ntb_transmit_locked"); + while ((m = drbr_peek(ifp, q->br)) != NULL) { + CTR1(KTR_NTB, "TX: start mbuf %p", m); + ETHER_BPF_MTAP(ifp, m); + len = m->m_pkthdr.len; + mflags = m->m_flags; + rc = ntb_transport_tx_enqueue(q->qp, m, m, len); if (rc != 0) { - CTR0(KTR_NTB, "RX: process_rxc failed"); + CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc); + if (rc == EAGAIN) { + drbr_putback(ifp, q->br, m); + callout_reset_sbt(&q->queue_full, + SBT_1MS / 4, SBT_1MS / 4, + ntb_qp_full, q, 0); + } else { + m_freem(m); + drbr_advance(ifp, q->br); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + } break; } - } - mtx_unlock(&qp->transport->rx_lock); - - if (i == qp->rx_max_entry) - taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work); - else if ((ntb_db_read(qp->ntb) & (1ull << qp->qp_num)) != 0) { - /* If db is set, clear it and read it back to commit clear. */ - ntb_db_clear(qp->ntb, 1ull << qp->qp_num); - (void)ntb_db_read(qp->ntb); - - /* - * An interrupt may have arrived between finishing - * ntb_process_rxc and clearing the doorbell bit: there might - * be some more work to do. - */ - taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work); + drbr_advance(ifp, q->br); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, len); + if (mflags & M_MCAST) + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } } static int -ntb_process_rxc(struct ntb_transport_qp *qp) +ntb_transmit(struct ifnet *ifp, struct mbuf *m) { - struct ntb_payload_header *hdr; - struct ntb_queue_entry *entry; - caddr_t offset; - - offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index; - hdr = (void *)(offset + qp->rx_max_frame - - sizeof(struct ntb_payload_header)); - - CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index); - if ((hdr->flags & IF_NTB_DESC_DONE_FLAG) == 0) { - CTR0(KTR_NTB, "RX: hdr not done"); - qp->rx_ring_empty++; - return (EAGAIN); - } - - if ((hdr->flags & IF_NTB_LINK_DOWN_FLAG) != 0) { - CTR0(KTR_NTB, "RX: link down"); - ntb_qp_link_down(qp); - hdr->flags = 0; - return (EAGAIN); - } - - if (hdr->ver != (uint32_t)qp->rx_pkts) { - CTR2(KTR_NTB,"RX: ver != rx_pkts (%x != %lx). 
" - "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts); - qp->rx_err_ver++; - return (EIO); - } - - entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q); - if (entry == NULL) { - qp->rx_err_no_buf++; - CTR0(KTR_NTB, "RX: No entries in rx_pend_q"); - return (EAGAIN); - } - callout_stop(&qp->rx_full); - CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry); - - entry->x_hdr = hdr; - entry->index = qp->rx_index; - - if (hdr->len > entry->len) { - CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju", - (uintmax_t)hdr->len, (uintmax_t)entry->len); - qp->rx_err_oflow++; - - entry->len = -EIO; - entry->flags |= IF_NTB_DESC_DONE_FLAG; + struct ntb_net_ctx *sc = ifp->if_softc; + struct ntb_net_queue *q; + int error, i; - taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task); - } else { - qp->rx_bytes += hdr->len; - qp->rx_pkts++; - - CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts); - - entry->len = hdr->len; - - ntb_memcpy_rx(qp, entry, offset); - } - - qp->rx_index++; - qp->rx_index %= qp->rx_max_entry; + CTR0(KTR_NTB, "TX: ntb_transmit"); + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + i = m->m_pkthdr.flowid % sc->num_queues; + else + i = curcpu % sc->num_queues; + q = &sc->queues[i]; + + error = drbr_enqueue(ifp, q->br, m); + if (error) + return (error); + + if (mtx_trylock(&q->tx_lock)) { + ntb_transmit_locked(q); + mtx_unlock(&q->tx_lock); + } else + taskqueue_enqueue(q->tx_tq, &q->tx_task); return (0); } static void -ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry, - void *offset) +ntb_handle_tx(void *arg, int pending) { - struct ifnet *ifp = entry->cb_data; - unsigned int len = entry->len; - struct mbuf *m; - - CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset); - m = m_devget(offset, len, 0, ifp, NULL); - m->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; + struct ntb_net_queue *q = arg; - entry->buf = (void *)m; - - /* Ensure that the data is globally visible before clearing the flag */ - wmb(); - - CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, m); - ntb_rx_copy_callback(qp, entry); + mtx_lock(&q->tx_lock); + ntb_transmit_locked(q); + mtx_unlock(&q->tx_lock); } -static inline void -ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data) +static void +ntb_qp_full(void *arg) { - struct ntb_queue_entry *entry; + struct ntb_net_queue *q = arg; - entry = data; - entry->flags |= IF_NTB_DESC_DONE_FLAG; - taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task); + CTR0(KTR_NTB, "TX: qp_full callout"); + if (ntb_transport_tx_free_entry(q->qp) > 0) + taskqueue_enqueue(q->tx_tq, &q->tx_task); + else + callout_schedule_sbt(&q->queue_full, + SBT_1MS / 4, SBT_1MS / 4, 0); } static void -ntb_complete_rxc(void *arg, int pending) +ntb_qflush(struct ifnet *ifp) { - struct ntb_transport_qp *qp = arg; - struct ntb_queue_entry *entry; + struct ntb_net_ctx *sc = ifp->if_softc; + struct ntb_net_queue *q; struct mbuf *m; - unsigned len; - - CTR0(KTR_NTB, "RX: rx_completion_task"); - - mtx_lock_spin(&qp->ntb_rx_q_lock); - - while (!STAILQ_EMPTY(&qp->rx_post_q)) { - entry = STAILQ_FIRST(&qp->rx_post_q); - if ((entry->flags & IF_NTB_DESC_DONE_FLAG) == 0) - break; - - entry->x_hdr->flags = 0; - iowrite32(entry->index, &qp->rx_info->entry); - - STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry); - - len = entry->len; - m = entry->buf; - - /* - * Re-initialize queue_entry for reuse; rx_handler takes - * ownership of the mbuf. 
- */ - entry->buf = NULL; - entry->len = transport_mtu; - entry->cb_data = qp->transport->ifp; - - STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry); - - mtx_unlock_spin(&qp->ntb_rx_q_lock); + int i; - CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m); - if (qp->rx_handler != NULL && qp->client_ready) - qp->rx_handler(qp, qp->cb_data, m, len); - else + for (i = 0; i < sc->num_queues; i++) { + q = &sc->queues[i]; + mtx_lock(&q->tx_lock); + while ((m = buf_ring_dequeue_sc(q->br)) != NULL) m_freem(m); - - mtx_lock_spin(&qp->ntb_rx_q_lock); - } - - mtx_unlock_spin(&qp->ntb_rx_q_lock); -} - -static void -ntb_transport_doorbell_callback(void *data, uint32_t vector) -{ - struct ntb_transport_ctx *nt = data; - struct ntb_transport_qp *qp; - struct _qpset db_bits; - uint64_t vec_mask; - unsigned qp_num; - - BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &db_bits); - BIT_NAND(QP_SETSIZE, &db_bits, &nt->qp_bitmap_free); - - vec_mask = ntb_db_vector_mask(nt->ntb, vector); - while (vec_mask != 0) { - qp_num = ffsll(vec_mask) - 1; - - if (test_bit(qp_num, &db_bits)) { - qp = &nt->qp_vec[qp_num]; - taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work); - } - - vec_mask &= ~(1ull << qp_num); - } -} - -/* Link Event handler */ -static void -ntb_transport_event_callback(void *data) -{ - struct ntb_transport_ctx *nt = data; - - if (ntb_link_is_up(nt->ntb, NULL, NULL)) { - ntb_printf(1, "HW link up\n"); - callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt); - } else { - ntb_printf(1, "HW link down\n"); - taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup); + mtx_unlock(&q->tx_lock); } + if_qflush(ifp); } -/* Link bring up */ +/* Network Device Callbacks */ static void -ntb_transport_link_work(void *arg) -{ - struct ntb_transport_ctx *nt = arg; - struct ntb_softc *ntb = nt->ntb; - struct ntb_transport_qp *qp; - uint64_t val64, size; - uint32_t val; - unsigned i; - int rc; - - /* send the local info, in the opposite order of the way we read it */ - for (i = 0; i < nt->mw_count; i++) { - size = nt->mw_vec[i].phys_size; - - if (max_mw_size != 0 && size > max_mw_size) - size = max_mw_size; - - ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2), - size >> 32); - ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), size); - } - - ntb_peer_spad_write(ntb, IF_NTB_NUM_MWS, nt->mw_count); - - ntb_peer_spad_write(ntb, IF_NTB_NUM_QPS, nt->qp_count); - - ntb_peer_spad_write(ntb, IF_NTB_VERSION, NTB_TRANSPORT_VERSION); - - /* Query the remote side for its info */ - val = 0; - ntb_spad_read(ntb, IF_NTB_VERSION, &val); - if (val != NTB_TRANSPORT_VERSION) - goto out; - - ntb_spad_read(ntb, IF_NTB_NUM_QPS, &val); - if (val != nt->qp_count) - goto out; - - ntb_spad_read(ntb, IF_NTB_NUM_MWS, &val); - if (val != nt->mw_count) - goto out; - - for (i = 0; i < nt->mw_count; i++) { - ntb_spad_read(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2), &val); - val64 = (uint64_t)val << 32; - - ntb_spad_read(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), &val); - val64 |= val; - - rc = ntb_set_mw(nt, i, val64); - if (rc != 0) - goto free_mws; - } - - nt->link_is_up = true; - ntb_printf(1, "transport link up\n"); - - for (i = 0; i < nt->qp_count; i++) { - qp = &nt->qp_vec[i]; - - ntb_transport_setup_qp_mw(nt, i); - - if (qp->client_ready) - callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp); - } - - return; - -free_mws: - for (i = 0; i < nt->mw_count; i++) - ntb_free_mw(nt, i); -out: - if (ntb_link_is_up(ntb, NULL, NULL)) - callout_reset(&nt->link_work, - NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt); -} - -static int -ntb_set_mw(struct 
ntb_transport_ctx *nt, int num_mw, size_t size) +ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, + int len) { - struct ntb_transport_mw *mw = &nt->mw_vec[num_mw]; - size_t xlat_size, buff_size; - int rc; - - if (size == 0) - return (EINVAL); - - xlat_size = roundup(size, mw->xlat_align_size); - buff_size = xlat_size; - - /* No need to re-setup */ - if (mw->xlat_size == xlat_size) - return (0); - - if (mw->buff_size != 0) - ntb_free_mw(nt, num_mw); - - /* Alloc memory for receiving data. Must be aligned */ - mw->xlat_size = xlat_size; - mw->buff_size = buff_size; - - mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_IF, M_ZERO, 0, - mw->addr_limit, mw->xlat_align, 0); - if (mw->virt_addr == NULL) { - ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n", - mw->buff_size, mw->xlat_size); - mw->xlat_size = 0; - mw->buff_size = 0; - return (ENOMEM); - } - /* TODO: replace with bus_space_* functions */ - mw->dma_addr = vtophys(mw->virt_addr); - - /* - * Ensure that the allocation from contigmalloc is aligned as - * requested. XXX: This may not be needed -- brought in for parity - * with the Linux driver. - */ - if (mw->dma_addr % mw->xlat_align != 0) { - ntb_printf(0, - "DMA memory 0x%jx not aligned to BAR size 0x%zx\n", - (uintmax_t)mw->dma_addr, size); - ntb_free_mw(nt, num_mw); - return (ENOMEM); - } - - /* Notify HW the memory location of the receive buffer */ - rc = ntb_mw_set_trans(nt->ntb, num_mw, mw->dma_addr, mw->xlat_size); - if (rc) { - ntb_printf(0, "Unable to set mw%d translation\n", num_mw); - ntb_free_mw(nt, num_mw); - return (rc); - } - return (0); + m_freem(data); + CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data); } static void -ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw) +ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data, + int len) { - struct ntb_transport_mw *mw = &nt->mw_vec[num_mw]; + struct ntb_net_queue *q = qp_data; + struct ntb_net_ctx *sc = q->sc; + struct mbuf *m = data; + struct ifnet *ifp = q->ifp; + uint16_t proto; - if (mw->virt_addr == NULL) + CTR1(KTR_NTB, "RX: rx handler (%d)", len); + if (len < 0) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; - - ntb_mw_clear_trans(nt->ntb, num_mw); - contigfree(mw->virt_addr, mw->xlat_size, M_NTB_IF); - mw->xlat_size = 0; - mw->buff_size = 0; - mw->virt_addr = NULL; -} - -static int -ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num) -{ - struct ntb_transport_qp *qp = &nt->qp_vec[qp_num]; - struct ntb_transport_mw *mw; - void *offset; - ntb_q_idx_t i; - size_t rx_size; - unsigned num_qps_mw, mw_num, mw_count; - - mw_count = nt->mw_count; - mw_num = QP_TO_MW(nt, qp_num); - mw = &nt->mw_vec[mw_num]; - - if (mw->virt_addr == NULL) - return (ENOMEM); - - if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count) - num_qps_mw = nt->qp_count / mw_count + 1; - else - num_qps_mw = nt->qp_count / mw_count; - - rx_size = mw->xlat_size / num_qps_mw; - qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count); - rx_size -= sizeof(struct ntb_rx_info); - - qp->remote_rx_info = (void*)(qp->rx_buff + rx_size); - - /* Due to house-keeping, there must be at least 2 buffs */ - qp->rx_max_frame = qmin(rx_size / 2, - transport_mtu + sizeof(struct ntb_payload_header)); - qp->rx_max_entry = rx_size / qp->rx_max_frame; - qp->rx_index = 0; - - qp->remote_rx_info->entry = qp->rx_max_entry - 1; - - /* Set up the hdr offsets with 0s */ - for (i = 0; i < qp->rx_max_entry; i++) { - offset = (void *)(qp->rx_buff + qp->rx_max_frame * 
(i + 1) - - sizeof(struct ntb_payload_header)); - memset(offset, 0, sizeof(struct ntb_payload_header)); } - qp->rx_pkts = 0; - qp->tx_pkts = 0; - qp->tx_index = 0; - - return (0); -} - -static void -ntb_qp_link_work(void *arg) -{ - struct ntb_transport_qp *qp = arg; - struct ntb_softc *ntb = qp->ntb; - struct ntb_transport_ctx *nt = qp->transport; - uint32_t val, dummy; - - ntb_spad_read(ntb, IF_NTB_QP_LINKS, &val); - - ntb_peer_spad_write(ntb, IF_NTB_QP_LINKS, val | (1ull << qp->qp_num)); - - /* query remote spad for qp ready bits */ - ntb_peer_spad_read(ntb, IF_NTB_QP_LINKS, &dummy); - - /* See if the remote side is up */ - if ((val & (1ull << qp->qp_num)) != 0) { - ntb_printf(2, "qp link up\n"); - qp->link_is_up = true; - - if (qp->event_handler != NULL) - qp->event_handler(qp->cb_data, NTB_LINK_UP); - - taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work); - } else if (nt->link_is_up) - callout_reset(&qp->link_work, - NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp); -} - -/* Link down event*/ -static void -ntb_transport_link_cleanup(struct ntb_transport_ctx *nt) -{ - struct ntb_transport_qp *qp; - struct _qpset qp_bitmap_alloc; - unsigned i; - - BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc); - BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free); - - /* Pass along the info to any clients */ - for (i = 0; i < nt->qp_count; i++) - if (test_bit(i, &qp_bitmap_alloc)) { - qp = &nt->qp_vec[i]; - ntb_qp_link_cleanup(qp); - callout_drain(&qp->link_work); - } - - if (!nt->link_is_up) - callout_drain(&nt->link_work); - - /* - * The scratchpad registers keep the values if the remote side - * goes down, blast them now to give them a sane value the next - * time they are accessed - */ - for (i = 0; i < IF_NTB_MAX_SPAD; i++) - ntb_spad_write(nt->ntb, i, 0); -} - -static void -ntb_transport_link_cleanup_work(void *arg, int pending __unused) -{ - - ntb_transport_link_cleanup(arg); -} - -static void -ntb_qp_link_down(struct ntb_transport_qp *qp) -{ - - ntb_qp_link_cleanup(qp); -} - -static void -ntb_qp_link_down_reset(struct ntb_transport_qp *qp) -{ - - qp->link_is_up = false; - - qp->tx_index = qp->rx_index = 0; - qp->tx_bytes = qp->rx_bytes = 0; - qp->tx_pkts = qp->rx_pkts = 0; - - qp->rx_ring_empty = 0; - qp->tx_ring_full = 0; - - qp->rx_err_no_buf = qp->tx_err_no_buf = 0; - qp->rx_err_oflow = qp->rx_err_ver = 0; -} - -static void -ntb_qp_link_cleanup(struct ntb_transport_qp *qp) -{ - struct ntb_transport_ctx *nt = qp->transport; - - callout_drain(&qp->link_work); - ntb_qp_link_down_reset(qp); - - if (qp->event_handler != NULL) - qp->event_handler(qp->cb_data, NTB_LINK_DOWN); - - if (nt->link_is_up) - callout_reset(&qp->link_work, - NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp); -} - -/* Link commanded down */ -/** - * ntb_transport_link_down - Notify NTB transport to no longer enqueue data - * @qp: NTB transport layer queue to be disabled - * - * Notify NTB transport layer of client's desire to no longer receive data on - * transport queue specified. It is the client's responsibility to ensure all - * entries on queue are purged or otherwise handled appropriately. 
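
A matching teardown sketch for a hypothetical client detach path, following the responsibility noted above:

    /* Stop advertising readiness, tell the peer, then release the queue. */
    ntb_transport_link_down(qp);
    ntb_transport_free_queue(qp);   /* drains callouts/tasks and frees entries */
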
- */ -static void -ntb_transport_link_down(struct ntb_transport_qp *qp) -{ - uint32_t val; - - if (qp == NULL) - return; - - qp->client_ready = false; - - ntb_spad_read(qp->ntb, IF_NTB_QP_LINKS, &val); - - ntb_peer_spad_write(qp->ntb, IF_NTB_QP_LINKS, - val & ~(1 << qp->qp_num)); - - if (qp->link_is_up) - ntb_send_link_down(qp); - else - callout_drain(&qp->link_work); -} - -static void -ntb_send_link_down(struct ntb_transport_qp *qp) -{ - struct ntb_queue_entry *entry; - int i, rc; - - if (!qp->link_is_up) - return; - - for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) { - entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q); - if (entry != NULL) + m->m_pkthdr.rcvif = ifp; + if (sc->num_queues > 1) { + m->m_pkthdr.flowid = q - sc->queues; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); + } + if (ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { + m_copydata(m, 12, 2, (void *)&proto); + switch (ntohs(proto)) { + case ETHERTYPE_IP: + if (ifp->if_capenable & IFCAP_RXCSUM) { + m->m_pkthdr.csum_data = 0xffff; + m->m_pkthdr.csum_flags = NTB_CSUM_SET; + } + break; + case ETHERTYPE_IPV6: + if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { + m->m_pkthdr.csum_data = 0xffff; + m->m_pkthdr.csum_flags = NTB_CSUM_SET; + } break; - pause("NTB Wait for link down", hz / 10); + } } - - if (entry == NULL) - return; - - entry->cb_data = NULL; - entry->buf = NULL; - entry->len = 0; - entry->flags = IF_NTB_LINK_DOWN_FLAG; - - mtx_lock(&qp->transport->tx_lock); - rc = ntb_process_tx(qp, entry); - if (rc != 0) - printf("ntb: Failed to send link down\n"); - mtx_unlock(&qp->transport->tx_lock); - - ntb_qp_link_down_reset(qp); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + ifp->if_input(ifp, m); } - -/* List Management */ - static void -ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry, - struct ntb_queue_list *list) -{ - - mtx_lock_spin(lock); - STAILQ_INSERT_TAIL(list, entry, entry); - mtx_unlock_spin(lock); -} - -static struct ntb_queue_entry * -ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list) -{ - struct ntb_queue_entry *entry; - - mtx_lock_spin(lock); - if (STAILQ_EMPTY(list)) { - entry = NULL; - goto out; - } - entry = STAILQ_FIRST(list); - STAILQ_REMOVE_HEAD(list, entry); -out: - mtx_unlock_spin(lock); - - return (entry); -} - -static struct ntb_queue_entry * -ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from, - struct ntb_queue_list *to) +ntb_net_event_handler(void *data, enum ntb_link_event status) { - struct ntb_queue_entry *entry; + struct ntb_net_queue *q = data; + int new_state; - mtx_lock_spin(lock); - if (STAILQ_EMPTY(from)) { - entry = NULL; - goto out; + switch (status) { + case NTB_LINK_DOWN: + new_state = LINK_STATE_DOWN; + break; + case NTB_LINK_UP: + new_state = LINK_STATE_UP; + break; + default: + new_state = LINK_STATE_UNKNOWN; + break; } - entry = STAILQ_FIRST(from); - STAILQ_REMOVE_HEAD(from, entry); - STAILQ_INSERT_TAIL(to, entry, entry); - -out: - mtx_unlock_spin(lock); - return (entry); + if_link_state_change(q->ifp, new_state); } /* Helper functions */ @@ -1693,27 +493,24 @@ static void create_random_local_eui48(u_char *eaddr) { static uint8_t counter = 0; - uint32_t seed = ticks; eaddr[0] = EUI48_LOCALLY_ADMINISTERED; - memcpy(&eaddr[1], &seed, sizeof(uint32_t)); + arc4rand(&eaddr[1], 4, 0); eaddr[5] = counter++; } -/** - * ntb_transport_max_size - Query the max payload size of a qp - * @qp: NTB transport layer queue to be queried - * - * Query the maximum payload size permissible on the given qp - * - * RETURNS: the max payload size of a qp - */ -static 
unsigned int -ntb_transport_max_size(struct ntb_transport_qp *qp) -{ - - if (qp == NULL) - return (0); +static device_method_t ntb_net_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ntb_net_probe), + DEVMETHOD(device_attach, ntb_net_attach), + DEVMETHOD(device_detach, ntb_net_detach), + DEVMETHOD_END +}; - return (qp->tx_max_frame - sizeof(struct ntb_payload_header)); -} +devclass_t ntb_net_devclass; +static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods, + sizeof(struct ntb_net_ctx)); +DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass, + NULL, NULL); +MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1); +MODULE_VERSION(if_ntb, 1); diff --git a/sys/dev/ntb/ntb.c b/sys/dev/ntb/ntb.c new file mode 100644 index 0000000..1cf1ba2 --- /dev/null +++ b/sys/dev/ntb/ntb.c @@ -0,0 +1,463 @@ +/*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <machine/bus.h> +#include <sys/rmlock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/sysctl.h> + +#include "ntb.h" + +devclass_t ntb_hw_devclass; +SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls"); + +struct ntb_child { + device_t dev; + int enabled; + int mwoff; + int mwcnt; + int spadoff; + int spadcnt; + int dboff; + int dbmask; + void *ctx; + const struct ntb_ctx_ops *ctx_ops; + struct rmlock ctx_lock; + struct ntb_child *next; +}; + +int +ntb_register_device(device_t dev) +{ + struct ntb_child **cpp = device_get_softc(dev); + struct ntb_child *nc; + int i, mw, mwu, mwt, spad, spadu, spadt, db, dbu, dbt; + char cfg[128] = ""; + char buf[32]; + char *n, *np, *c, *p, *name; + + mwu = 0; + mwt = NTB_MW_COUNT(dev); + spadu = 0; + spadt = NTB_SPAD_COUNT(dev); + dbu = 0; + dbt = flsll(NTB_DB_VALID_MASK(dev)); + + device_printf(dev, "%d memory windows, %d scratchpads, " + "%d doorbells\n", mwt, spadt, dbt); + + snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev), + device_get_unit(dev)); + TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg)); + n = cfg; + i = 0; + while ((c = strsep(&n, ",")) != NULL) { + np = c; + name = strsep(&np, ":"); + if (name != NULL && name[0] == 0) + name = NULL; + p = strsep(&np, ":"); + mw = (p && p[0] != 0) ? strtol(p, NULL, 10) : mwt - mwu; + p = strsep(&np, ":"); + spad = (p && p[0] != 0) ? strtol(p, NULL, 10) : spadt - spadu; + db = (np && np[0] != 0) ? strtol(np, NULL, 10) : dbt - dbu; + + if (mw > mwt - mwu || spad > spadt - spadu || db > dbt - dbu) { + device_printf(dev, "Not enough resources for config\n"); + break; + } + + nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO); + nc->mwoff = mwu; + nc->mwcnt = mw; + nc->spadoff = spadu; + nc->spadcnt = spad; + nc->dboff = dbu; + nc->dbmask = (db == 0) ? 
0 : (0xffffffffffffffff >> (64 - db)); + rm_init(&nc->ctx_lock, "ntb ctx"); + nc->dev = device_add_child(dev, name, -1); + if (nc->dev == NULL) { + ntb_unregister_device(dev); + return (ENOMEM); + } + device_set_ivars(nc->dev, nc); + *cpp = nc; + cpp = &nc->next; + + if (bootverbose) { + device_printf(dev, "%d \"%s\":", i, name); + if (mw > 0) { + printf(" memory windows %d", mwu); + if (mw > 1) + printf("-%d", mwu + mw - 1); + } + if (spad > 0) { + printf(" scratchpads %d", spadu); + if (spad > 1) + printf("-%d", spadu + spad - 1); + } + if (db > 0) { + printf(" doorbells %d", dbu); + if (db > 1) + printf("-%d", dbu + db - 1); + } + printf("\n"); + } + + mwu += mw; + spadu += spad; + dbu += db; + i++; + } + + bus_generic_attach(dev); + return (0); +} + +int +ntb_unregister_device(device_t dev) +{ + struct ntb_child **cpp = device_get_softc(dev); + struct ntb_child *nc; + int error = 0; + + while ((nc = *cpp) != NULL) { + *cpp = (*cpp)->next; + error = device_delete_child(dev, nc->dev); + if (error) + break; + rm_destroy(&nc->ctx_lock); + free(nc, M_DEVBUF); + } + return (error); +} + +void +ntb_link_event(device_t dev) +{ + struct ntb_child **cpp = device_get_softc(dev); + struct ntb_child *nc; + struct rm_priotracker ctx_tracker; + + for (nc = *cpp; nc != NULL; nc = nc->next) { + rm_rlock(&nc->ctx_lock, &ctx_tracker); + if (nc->ctx_ops != NULL && nc->ctx_ops->link_event != NULL) + nc->ctx_ops->link_event(nc->ctx); + rm_runlock(&nc->ctx_lock, &ctx_tracker); + } +} + +void +ntb_db_event(device_t dev, uint32_t vec) +{ + struct ntb_child **cpp = device_get_softc(dev); + struct ntb_child *nc; + struct rm_priotracker ctx_tracker; + + for (nc = *cpp; nc != NULL; nc = nc->next) { + rm_rlock(&nc->ctx_lock, &ctx_tracker); + if (nc->ctx_ops != NULL && nc->ctx_ops->db_event != NULL) + nc->ctx_ops->db_event(nc->ctx, vec); + rm_runlock(&nc->ctx_lock, &ctx_tracker); + } +} + +bool +ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width) +{ + + return (NTB_LINK_IS_UP(device_get_parent(ntb), speed, width)); +} + +int +ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width) +{ + struct ntb_child *nc = device_get_ivars(ntb); + struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev)); + struct ntb_child *nc1; + + for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) { + if (nc1->enabled) { + nc->enabled = 1; + return (0); + } + } + nc->enabled = 1; + return (NTB_LINK_ENABLE(device_get_parent(ntb), speed, width)); +} + +int +ntb_link_disable(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev)); + struct ntb_child *nc1; + + if (!nc->enabled) + return (0); + nc->enabled = 0; + for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) { + if (nc1->enabled) + return (0); + } + return (NTB_LINK_DISABLE(device_get_parent(ntb))); +} + +bool +ntb_link_enabled(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (nc->enabled && NTB_LINK_ENABLED(device_get_parent(ntb))); +} + +int +ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + if (ctx == NULL || ctx_ops == NULL) + return (EINVAL); + + rm_wlock(&nc->ctx_lock); + if (nc->ctx_ops != NULL) { + rm_wunlock(&nc->ctx_lock); + return (EINVAL); + } + nc->ctx = ctx; + nc->ctx_ops = ctx_ops; + rm_wunlock(&nc->ctx_lock); + + return (0); +} + +void * +ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + 
KASSERT(nc->ctx != NULL && nc->ctx_ops != NULL, ("bogus")); + if (ctx_ops != NULL) + *ctx_ops = nc->ctx_ops; + return (nc->ctx); +} + +void +ntb_clear_ctx(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + rm_wlock(&nc->ctx_lock); + nc->ctx = NULL; + nc->ctx_ops = NULL; + rm_wunlock(&nc->ctx_lock); +} + +uint8_t +ntb_mw_count(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (nc->mwcnt); +} + +int +ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base, + caddr_t *vbase, size_t *size, size_t *align, size_t *align_size, + bus_addr_t *plimit) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_MW_GET_RANGE(device_get_parent(ntb), mw_idx + nc->mwoff, + base, vbase, size, align, align_size, plimit)); +} + +int +ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr, size_t size) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_MW_SET_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff, + addr, size)); +} + +int +ntb_mw_clear_trans(device_t ntb, unsigned mw_idx) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_MW_CLEAR_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff)); +} + +int +ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_MW_GET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode)); +} + +int +ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_MW_SET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode)); +} + +uint8_t +ntb_spad_count(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (nc->spadcnt); +} + +void +ntb_spad_clear(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + unsigned i; + + for (i = 0; i < nc->spadcnt; i++) + NTB_SPAD_WRITE(device_get_parent(ntb), i + nc->spadoff, 0); +} + +int +ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff, val)); +} + +int +ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff, val)); +} + +int +ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_PEER_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff, + val)); +} + +int +ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_PEER_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff, + val)); +} + +uint64_t +ntb_db_valid_mask(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (nc->dbmask); +} + +int +ntb_db_vector_count(device_t ntb) +{ + + return (NTB_DB_VECTOR_COUNT(device_get_parent(ntb))); +} + +uint64_t +ntb_db_vector_mask(device_t ntb, uint32_t vector) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return ((NTB_DB_VECTOR_MASK(device_get_parent(ntb), vector) + >> nc->dboff) & nc->dbmask); +} + +int +ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size) +{ + + return (NTB_PEER_DB_ADDR(device_get_parent(ntb), db_addr, db_size)); +} + +void +ntb_db_clear(device_t ntb, uint64_t bits) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_DB_CLEAR(device_get_parent(ntb), bits << nc->dboff)); +} + +void 
+ntb_db_clear_mask(device_t ntb, uint64_t bits) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_DB_CLEAR_MASK(device_get_parent(ntb), bits << nc->dboff)); +} + +uint64_t +ntb_db_read(device_t ntb) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return ((NTB_DB_READ(device_get_parent(ntb)) >> nc->dboff) + & nc->dbmask); +} + +void +ntb_db_set_mask(device_t ntb, uint64_t bits) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_DB_SET_MASK(device_get_parent(ntb), bits << nc->dboff)); +} + +void +ntb_peer_db_set(device_t ntb, uint64_t bits) +{ + struct ntb_child *nc = device_get_ivars(ntb); + + return (NTB_PEER_DB_SET(device_get_parent(ntb), bits << nc->dboff)); +} + +MODULE_VERSION(ntb, 1); diff --git a/sys/dev/ntb/ntb.h b/sys/dev/ntb/ntb.h new file mode 100644 index 0000000..8593c65 --- /dev/null +++ b/sys/dev/ntb/ntb.h @@ -0,0 +1,409 @@ +/*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NTB_H_ +#define _NTB_H_ + +#include "ntb_if.h" + +extern devclass_t ntb_hw_devclass; +SYSCTL_DECL(_hw_ntb); + +int ntb_register_device(device_t ntb); +int ntb_unregister_device(device_t ntb); + +/* + * ntb_link_event() - notify driver context of a change in link status + * @ntb: NTB device context + * + * Notify the driver context that the link status may have changed. The driver + * should call intb_link_is_up() to get the current status. + */ +void ntb_link_event(device_t ntb); + +/* + * ntb_db_event() - notify driver context of a doorbell event + * @ntb: NTB device context + * @vector: Interrupt vector number + * + * Notify the driver context of a doorbell event. If hardware supports + * multiple interrupt vectors for doorbells, the vector number indicates which + * vector received the interrupt. The vector number is relative to the first + * vector used for doorbells, starting at zero, and must be less than + * ntb_db_vector_count(). The driver may call ntb_db_read() to check which + * doorbell bits need service, and ntb_db_vector_mask() to determine which of + * those bits are associated with the vector number. 
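
A minimal doorbell callback following the flow described above (the softc layout and names are hypothetical, not part of this patch):

    static void
    my_db_event(void *ctx, uint32_t vec)
    {
            struct my_softc *sc = ctx;
            uint64_t db;
            int bit;

            /* Which of the bits routed to this vector are pending? */
            db = ntb_db_read(sc->dev) & ntb_db_vector_mask(sc->dev, vec);
            /* Clear (re-arm) them before servicing so new events are not lost. */
            ntb_db_clear(sc->dev, db);
            while (db != 0) {
                    bit = ffsll(db) - 1;
                    /* ... service doorbell 'bit' ... */
                    db &= ~(1ull << bit);
            }
    }
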
+ */ +void ntb_db_event(device_t ntb, uint32_t vec); + +/* + * ntb_link_is_up() - get the current ntb link state + * @ntb: NTB device context + * @speed: OUT - The link speed expressed as PCIe generation number + * @width: OUT - The link width expressed as the number of PCIe lanes + * + * RETURNS: true or false based on the hardware link state + */ +bool ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width); + +/* + * ntb_link_enable() - enable the link on the secondary side of the ntb + * @ntb: NTB device context + * @max_speed: The maximum link speed expressed as PCIe generation number[0] + * @max_width: The maximum link width expressed as the number of PCIe lanes[0] + * + * Enable the link on the secondary side of the ntb. This can only be done + * from the primary side of the ntb in primary or b2b topology. The ntb device + * should train the link to its maximum speed and width, or the requested speed + * and width, whichever is smaller, if supported. + * + * Return: Zero on success, otherwise an error number. + * + * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed + * and width input will be ignored. + */ +int ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width); + +/* + * ntb_link_disable() - disable the link on the secondary side of the ntb + * @ntb: NTB device context + * + * Disable the link on the secondary side of the ntb. This can only be done + * from the primary side of the ntb in primary or b2b topology. The ntb device + * should disable the link. Returning from this call must indicate that a + * barrier has passed, and that no more writes will pass in either direction + * across the link, unless this call returns an error number. + * + * Return: Zero on success, otherwise an error number. + */ +int ntb_link_disable(device_t ntb); + +/* + * get enable status of the link on the secondary side of the ntb + */ +bool ntb_link_enabled(device_t ntb); + +/* + * ntb_set_ctx() - associate a driver context with an ntb device + * @ntb: NTB device context + * @ctx: Driver context + * @ctx_ops: Driver context operations + * + * Associate a driver context and operations with an ntb device. The context is + * provided by the client driver, and the driver may associate a different + * context with each ntb device. + * + * Return: Zero if the context is associated, otherwise an error number. + */ +int ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops); + +/* + * ntb_get_ctx() - get the driver context associated with an ntb device + * @ntb: NTB device context + * @ctx_ops: Driver context operations + * + * Get the driver context and operations associated with an ntb device. + */ +void * ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops); + +/* + * ntb_clear_ctx() - disassociate any driver context from an ntb device + * @ntb: NTB device context + * + * Clear any association that may exist between a driver context and the ntb + * device. + */ +void ntb_clear_ctx(device_t ntb); + +/* + * ntb_mw_count() - Get the number of memory windows available for KPI + * consumers. + * + * (Excludes any MW wholly reserved for register access.)
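
A hypothetical client attach/detach path built on the calls documented above (variable names, callbacks and the softc pointer are assumptions):

    static const struct ntb_ctx_ops my_ctx_ops = {
            .link_event = my_link_event,    /* void (*)(void *ctx) */
            .db_event = my_db_event,        /* void (*)(void *ctx, uint32_t vec) */
    };

    /* Attach: register the context, then ask for the link to come up. */
    error = ntb_set_ctx(dev, sc, &my_ctx_ops);
    if (error != 0)
            return (error);
    ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);

    /* Detach: reverse order. */
    ntb_link_disable(dev);
    ntb_clear_ctx(dev);
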
+ */ +uint8_t ntb_mw_count(device_t ntb); + +/* + * ntb_mw_get_range() - get the range of a memory window + * @ntb: NTB device context + * @idx: Memory window number + * @base: OUT - the base address for mapping the memory window + * @size: OUT - the size for mapping the memory window + * @align: OUT - the base alignment for translating the memory window + * @align_size: OUT - the size alignment for translating the memory window + * + * Get the range of a memory window. NULL may be given for any output + * parameter if the value is not needed. The base and size may be used for + * mapping the memory window, to access the peer memory. The alignment and + * size may be used for translating the memory window, for the peer to access + * memory on the local system. + * + * Return: Zero on success, otherwise an error number. + */ +int ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base, + caddr_t *vbase, size_t *size, size_t *align, size_t *align_size, + bus_addr_t *plimit); + +/* + * ntb_mw_set_trans() - set the translation of a memory window + * @ntb: NTB device context + * @idx: Memory window number + * @addr: The dma address of the local memory to expose to the peer + * @size: The size of the local memory to expose to the peer + * + * Set the translation of a memory window. The peer may access local memory + * through the window starting at the address, up to the size. The address + * must be aligned to the alignment specified by ntb_mw_get_range(). The size + * must be aligned to the size alignment specified by ntb_mw_get_range(). The + * address must be below the plimit specified by ntb_mw_get_range() (i.e. for + * 32-bit BARs). + * + * Return: Zero on success, otherwise an error number. + */ +int ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr, + size_t size); + +/* + * ntb_mw_clear_trans() - clear the translation of a memory window + * @ntb: NTB device context + * @idx: Memory window number + * + * Clear the translation of a memory window. The peer may no longer access + * local memory through the window. + * + * Return: Zero on success, otherwise an error number. + */ +int ntb_mw_clear_trans(device_t ntb, unsigned mw_idx); + +/* + * ntb_mw_get_wc - Get the write-combine status of a memory window + * + * Returns: Zero on success, setting *mode; otherwise an error number (e.g. if + * idx is an invalid memory window). + * + * Mode is a VM_MEMATTR_* type. + */ +int ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode); + +/* + * ntb_mw_set_wc - Set the write-combine status of a memory window + * + * If 'mode' matches the current status, this does nothing and succeeds. Mode + * is a VM_MEMATTR_* type. + * + * Returns: Zero on success, setting the caching attribute on the virtual + * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid + * memory window, or if changing the caching attribute fails). + */ +int ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode); + +/* + * ntb_spad_count() - get the total scratch regs usable + * @ntb: pointer to ntb_softc instance + * + * This function returns the max number of 32bit scratchpad registers usable by + * the upper layer. + * + * RETURNS: total number of scratch pad registers available + */ +uint8_t ntb_spad_count(device_t ntb); + +/* + * ntb_spad_clear() - zero local scratch registers + * @ntb: pointer to ntb_softc instance + * + * This function overwrites all local scratchpad registers with zeroes.
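
The memory window calls above are normally used together; a rough sketch, assuming a device handle 'dev', the M_DEVBUF malloc type and no error handling:

    vm_paddr_t base;
    caddr_t vbase;
    size_t size, align, align_size;
    bus_addr_t limit;
    void *buf;

    ntb_mw_get_range(dev, 0, &base, &vbase, &size, &align, &align_size, &limit);
    /* Back window 0 with local contiguous memory the peer may then access. */
    buf = contigmalloc(size, M_DEVBUF, M_WAITOK | M_ZERO, 0, limit, align, 0);
    ntb_mw_set_trans(dev, 0, vtophys(buf), size);
    /* ... later: ntb_mw_clear_trans(dev, 0) before freeing 'buf'. */
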
+ */ +void ntb_spad_clear(device_t ntb); + +/* + * ntb_spad_write() - write to the secondary scratchpad register + * @ntb: pointer to ntb_softc instance + * @idx: index to the scratchpad register, 0 based + * @val: the data value to put into the register + * + * This function allows writing of a 32bit value to the indexed scratchpad + * register. The register resides on the secondary (external) side. + * + * RETURNS: An appropriate ERRNO error value on error, or zero for success. + */ +int ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val); + +/* + * ntb_spad_read() - read from the primary scratchpad register + * @ntb: pointer to ntb_softc instance + * @idx: index to scratchpad register, 0 based + * @val: pointer to 32bit integer for storing the register value + * + * This function allows reading of the 32bit scratchpad register on + * the primary (internal) side. + * + * RETURNS: An appropriate ERRNO error value on error, or zero for success. + */ +int ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val); + +/* + * ntb_peer_spad_write() - write to the secondary scratchpad register + * @ntb: pointer to ntb_softc instance + * @idx: index to the scratchpad register, 0 based + * @val: the data value to put into the register + * + * This function allows writing of a 32bit value to the indexed scratchpad + * register. The register resides on the secondary (external) side. + * + * RETURNS: An appropriate ERRNO error value on error, or zero for success. + */ +int ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val); + +/* + * ntb_peer_spad_read() - read from the primary scratchpad register + * @ntb: pointer to ntb_softc instance + * @idx: index to scratchpad register, 0 based + * @val: pointer to 32bit integer for storing the register value + * + * This function allows reading of the 32bit scratchpad register on + * the primary (internal) side. + * + * RETURNS: An appropriate ERRNO error value on error, or zero for success. + */ +int ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val); + +/* + * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb + * @ntb: NTB device context + * + * Hardware may support different number or arrangement of doorbell bits. + * + * Return: A mask of doorbell bits supported by the ntb. + */ +uint64_t ntb_db_valid_mask(device_t ntb); + +/* + * ntb_db_vector_count() - get the number of doorbell interrupt vectors + * @ntb: NTB device context. + * + * Hardware may support different number of interrupt vectors. + * + * Return: The number of doorbell interrupt vectors. + */ +int ntb_db_vector_count(device_t ntb); + +/* + * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector + * @ntb: NTB device context + * @vector: Doorbell vector number + * + * Each interrupt vector may have a different number or arrangement of bits. + * + * Return: A mask of doorbell bits serviced by a vector. + */ +uint64_t ntb_db_vector_mask(device_t ntb, uint32_t vector); + +/* + * ntb_peer_db_addr() - address and size of the peer doorbell register + * @ntb: NTB device context. + * @db_addr: OUT - The address of the peer doorbell register. + * @db_size: OUT - The number of bytes to write the peer doorbell register. + * + * Return the address of the peer doorbell register. This may be used, for + * example, by drivers that offload memory copy operations to a dma engine. + * The drivers may wish to ring the peer doorbell at the completion of memory + * copy operations. 
For efficiency, and to simplify ordering of operations + * between the dma memory copies and the ringing doorbell, the driver may + * append one additional dma memory copy with the doorbell register as the + * destination, after the memory copy operations. + * + * Return: Zero on success, otherwise an error number. + * + * Note that writing the peer doorbell via a memory window will *not* generate + * an interrupt on the remote host; that must be done separately. + */ +int ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size); + +/* + * ntb_db_clear() - clear bits in the local doorbell register + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the local doorbell register, arming the bits for the next + * doorbell. + * + * Return: Zero on success, otherwise an error number. + */ +void ntb_db_clear(device_t ntb, uint64_t bits); + +/* + * ntb_db_clear_mask() - clear bits in the local doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell bits to clear. + * + * Clear bits in the local doorbell mask register, allowing doorbell interrupts + * to be generated for those doorbell bits. If a doorbell bit is already + * set at the time the mask is cleared, and the corresponding mask bit is + * changed from set to clear, then the ntb driver must ensure that + * ntb_db_event() is called. If the hardware does not generate the interrupt + * on clearing the mask bit, then the driver must call ntb_db_event() anyway. + * + * Return: Zero on success, otherwise an error number. + */ +void ntb_db_clear_mask(device_t ntb, uint64_t bits); + +/* + * ntb_db_read() - read the local doorbell register + * @ntb: NTB device context. + * + * Read the local doorbell register, and return the bits that are set. + * + * Return: The bits currently set in the local doorbell register. + */ +uint64_t ntb_db_read(device_t ntb); + +/* + * ntb_db_set_mask() - set bits in the local doorbell mask + * @ntb: NTB device context. + * @db_bits: Doorbell mask bits to set. + * + * Set bits in the local doorbell mask register, preventing doorbell interrupts + * from being generated for those doorbell bits. Bits that were already set + * must remain set. + * + * Return: Zero on success, otherwise an error number. + */ +void ntb_db_set_mask(device_t ntb, uint64_t bits); + +/* + * ntb_peer_db_set() - Set the doorbell on the secondary/external side + * @ntb: pointer to ntb_softc instance + * @bits: doorbell bits to ring + * + * This function allows triggering of a doorbell on the secondary/external + * side that will initiate an interrupt on the remote host. + */ +void ntb_peer_db_set(device_t ntb, uint64_t bits); + +#endif /* _NTB_H_ */ diff --git a/sys/dev/ntb/ntb.h b/sys/dev/ntb/ntb_hw/ntb_hw.c index b757f01..609aa4d 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw.c +++ b/sys/dev/ntb/ntb_hw/ntb_hw.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> * Copyright (C) 2013 Intel Corporation * Copyright (C) 2015 EMC Corporation * All rights reserved. @@ -25,6 +26,16 @@ * SUCH DAMAGE. */ +/* + * The Non-Transparent Bridge (NTB) is a device that allows you to connect + * two or more systems using PCI-e links, providing remote memory access. + * + * This module contains a driver for NTB hardware in Intel Xeon/Atom CPUs. + * + * NOTE: Much of the code in this module is shared with Linux. Any patches may + * be picked up and redistributed in Linux with a dual GPL/BSD license.
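
Taken together, the scratchpad and doorbell primitives documented above form the usual handshake pattern between the two hosts; a hypothetical sketch (the register index, value and doorbell bit are arbitrary):

    /* Host A: publish a value for the peer, then ring its doorbell bit 0. */
    ntb_peer_spad_write(dev, 0, 0x1234);
    ntb_peer_db_set(dev, 1ull << 0);

    /* Host B, inside its db_event callback, picks the value up. */
    uint32_t val;
    ntb_spad_read(dev, 0, &val);
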
+ */ + #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -33,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/endian.h> +#include <sys/interrupt.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> @@ -51,19 +63,7 @@ __FBSDID("$FreeBSD$"); #include <dev/pci/pcivar.h> #include "ntb_regs.h" -#include "ntb_hw.h" - -/* - * The Non-Transparent Bridge (NTB) is a device on some Intel processors that - * allows you to connect two systems using a PCI-e link. - * - * This module contains the hardware abstraction layer for the NTB. It allows - * you to send and recieve interrupts, map the memory windows and send and - * receive messages in the scratch-pad registers. - * - * NOTE: Much of the code in this module is shared with Linux. Any patches may - * be picked up and redistributed in Linux with a dual GPL/BSD license. - */ +#include "../ntb.h" #define MAX_MSIX_INTERRUPTS MAX(XEON_DB_COUNT, ATOM_DB_COUNT) @@ -71,8 +71,6 @@ __FBSDID("$FreeBSD$"); #define ATOM_LINK_RECOVERY_TIME 500 /* ms */ #define BAR_HIGH_MASK (~((1ull << 12) - 1)) -#define DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev)) - #define NTB_MSIX_VER_GUARD 0xaabbccdd #define NTB_MSIX_RECEIVED 0xe0f0e0f0 @@ -123,8 +121,8 @@ enum { }; /* Device features and workarounds */ -#define HAS_FEATURE(feature) \ - ((ntb->features & (feature)) != 0) +#define HAS_FEATURE(ntb, feature) \ + (((ntb)->features & (feature)) != 0) struct ntb_hw_info { uint32_t device_id; @@ -203,6 +201,9 @@ struct ntb_msix_data { }; struct ntb_softc { + /* ntb.c context. Do not move! Must go first! */ + void *ntb_store; + device_t device; enum ntb_device_type type; uint32_t features; @@ -221,13 +222,7 @@ struct ntb_softc { struct callout heartbeat_timer; struct callout lr_timer; - void *ntb_ctx; - const struct ntb_ctx_ops *ctx_ops; struct ntb_vec *msix_vec; -#define CTX_LOCK(sc) mtx_lock(&(sc)->ctx_lock) -#define CTX_UNLOCK(sc) mtx_unlock(&(sc)->ctx_lock) -#define CTX_ASSERT(sc,f) mtx_assert(&(sc)->ctx_lock, (f)) - struct mtx ctx_lock; uint32_t ppd; enum ntb_conn_type conn_type; @@ -259,6 +254,7 @@ struct ntb_softc { uint64_t db_valid_mask; uint64_t db_link_mask; uint64_t db_mask; + uint64_t fake_db_bell; /* NTB_SB01BASE_LOCKUP*/ int last_ts; /* ticks @ last irq */ @@ -288,61 +284,74 @@ bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t handle, } #endif -#define ntb_bar_read(SIZE, bar, offset) \ +#define intel_ntb_bar_read(SIZE, bar, offset) \ bus_space_read_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \ ntb->bar_info[(bar)].pci_bus_handle, (offset)) -#define ntb_bar_write(SIZE, bar, offset, val) \ +#define intel_ntb_bar_write(SIZE, bar, offset, val) \ bus_space_write_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \ ntb->bar_info[(bar)].pci_bus_handle, (offset), (val)) -#define ntb_reg_read(SIZE, offset) ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset) -#define ntb_reg_write(SIZE, offset, val) \ - ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val) -#define ntb_mw_read(SIZE, offset) \ - ntb_bar_read(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), offset) -#define ntb_mw_write(SIZE, offset, val) \ - ntb_bar_write(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \ +#define intel_ntb_reg_read(SIZE, offset) \ + intel_ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset) +#define intel_ntb_reg_write(SIZE, offset, val) \ + intel_ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val) +#define intel_ntb_mw_read(SIZE, offset) \ + intel_ntb_bar_read(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \ + offset) +#define 
intel_ntb_mw_write(SIZE, offset, val) \ + intel_ntb_bar_write(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \ offset, val) -static int ntb_probe(device_t device); -static int ntb_attach(device_t device); -static int ntb_detach(device_t device); -static unsigned ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx); -static inline enum ntb_bar ntb_mw_to_bar(struct ntb_softc *, unsigned mw); +static int intel_ntb_probe(device_t device); +static int intel_ntb_attach(device_t device); +static int intel_ntb_detach(device_t device); +static uint64_t intel_ntb_db_valid_mask(device_t dev); +static void intel_ntb_spad_clear(device_t dev); +static uint64_t intel_ntb_db_vector_mask(device_t dev, uint32_t vector); +static bool intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed, + enum ntb_width *width); +static int intel_ntb_link_enable(device_t dev, enum ntb_speed speed, + enum ntb_width width); +static int intel_ntb_link_disable(device_t dev); +static int intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val); +static int intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val); + +static unsigned intel_ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx); +static inline enum ntb_bar intel_ntb_mw_to_bar(struct ntb_softc *, unsigned mw); static inline bool bar_is_64bit(struct ntb_softc *, enum ntb_bar); static inline void bar_get_xlat_params(struct ntb_softc *, enum ntb_bar, uint32_t *base, uint32_t *xlat, uint32_t *lmt); -static int ntb_map_pci_bars(struct ntb_softc *ntb); -static int ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx, +static int intel_ntb_map_pci_bars(struct ntb_softc *ntb); +static int intel_ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx, vm_memattr_t); static void print_map_success(struct ntb_softc *, struct ntb_pci_bar_info *, const char *); static int map_mmr_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar); static int map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar); -static void ntb_unmap_pci_bar(struct ntb_softc *ntb); -static int ntb_remap_msix(device_t, uint32_t desired, uint32_t avail); -static int ntb_init_isr(struct ntb_softc *ntb); -static int ntb_setup_legacy_interrupt(struct ntb_softc *ntb); -static int ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors); -static void ntb_teardown_interrupts(struct ntb_softc *ntb); -static inline uint64_t ntb_vec_mask(struct ntb_softc *, uint64_t db_vector); -static void ntb_interrupt(struct ntb_softc *, uint32_t vec); +static void intel_ntb_unmap_pci_bar(struct ntb_softc *ntb); +static int intel_ntb_remap_msix(device_t, uint32_t desired, uint32_t avail); +static int intel_ntb_init_isr(struct ntb_softc *ntb); +static int intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb); +static int intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors); +static void intel_ntb_teardown_interrupts(struct ntb_softc *ntb); +static inline uint64_t intel_ntb_vec_mask(struct ntb_softc *, uint64_t db_vector); +static void intel_ntb_interrupt(struct ntb_softc *, uint32_t vec); static void ndev_vec_isr(void *arg); static void ndev_irq_isr(void *arg); static inline uint64_t db_ioread(struct ntb_softc *, uint64_t regoff); static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t); static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t); -static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors); -static void ntb_free_msix_vec(struct ntb_softc *ntb); -static void ntb_get_msix_info(struct 
ntb_softc *ntb); -static void ntb_exchange_msix(void *); -static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id); -static void ntb_detect_max_mw(struct ntb_softc *ntb); -static int ntb_detect_xeon(struct ntb_softc *ntb); -static int ntb_detect_atom(struct ntb_softc *ntb); -static int ntb_xeon_init_dev(struct ntb_softc *ntb); -static int ntb_atom_init_dev(struct ntb_softc *ntb); -static void ntb_teardown_xeon(struct ntb_softc *ntb); +static int intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors); +static void intel_ntb_free_msix_vec(struct ntb_softc *ntb); +static void intel_ntb_get_msix_info(struct ntb_softc *ntb); +static void intel_ntb_exchange_msix(void *); +static struct ntb_hw_info *intel_ntb_get_device_info(uint32_t device_id); +static void intel_ntb_detect_max_mw(struct ntb_softc *ntb); +static int intel_ntb_detect_xeon(struct ntb_softc *ntb); +static int intel_ntb_detect_atom(struct ntb_softc *ntb); +static int intel_ntb_xeon_init_dev(struct ntb_softc *ntb); +static int intel_ntb_atom_init_dev(struct ntb_softc *ntb); +static void intel_ntb_teardown_xeon(struct ntb_softc *ntb); static void configure_atom_secondary_side_bars(struct ntb_softc *ntb); static void xeon_reset_sbar_size(struct ntb_softc *, enum ntb_bar idx, enum ntb_bar regbar); @@ -352,18 +361,16 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr, enum ntb_bar idx); static int xeon_setup_b2b_mw(struct ntb_softc *, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr); -static int xeon_setup_msix_bar(struct ntb_softc *); static inline bool link_is_up(struct ntb_softc *ntb); static inline bool _xeon_link_is_up(struct ntb_softc *ntb); static inline bool atom_link_is_err(struct ntb_softc *ntb); -static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *); -static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *); +static inline enum ntb_speed intel_ntb_link_sta_speed(struct ntb_softc *); +static inline enum ntb_width intel_ntb_link_sta_width(struct ntb_softc *); static void atom_link_hb(void *arg); -static void ntb_db_event(struct ntb_softc *ntb, uint32_t vec); static void recover_atom_link(void *arg); -static bool ntb_poll_link(struct ntb_softc *ntb); +static bool intel_ntb_poll_link(struct ntb_softc *ntb); static void save_bar_parameters(struct ntb_pci_bar_info *bar); -static void ntb_sysctl_init(struct ntb_softc *); +static void intel_ntb_sysctl_init(struct ntb_softc *); static int sysctl_handle_features(SYSCTL_HANDLER_ARGS); static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS); static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS); @@ -374,7 +381,7 @@ static unsigned g_ntb_hw_debug_level; TUNABLE_INT("hw.ntb.debug_level", &g_ntb_hw_debug_level); SYSCTL_UINT(_hw_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN, &g_ntb_hw_debug_level, 0, "ntb_hw log level -- higher is more verbose"); -#define ntb_printf(lvl, ...) do { \ +#define intel_ntb_printf(lvl, ...) do { \ if ((lvl) <= g_ntb_hw_debug_level) { \ device_printf(ntb->device, __VA_ARGS__); \ } \ @@ -398,7 +405,7 @@ SYSCTL_UINT(_hw_ntb, OID_AUTO, default_mw_pat, CTLFLAG_RDTUN, "UC-: " __XSTRING(_NTB_PAT_UCM)); static inline vm_memattr_t -ntb_pat_flags(void) +intel_ntb_pat_flags(void) { switch (g_ntb_mw_pat) { @@ -424,7 +431,7 @@ ntb_pat_flags(void) * anywhere better yet. 
*/ static inline const char * -ntb_vm_memattr_to_str(vm_memattr_t pat) +intel_ntb_vm_memattr_to_str(vm_memattr_t pat) { switch (pat) { @@ -445,7 +452,8 @@ ntb_vm_memattr_to_str(vm_memattr_t pat) } } -static int g_ntb_msix_idx = 0; +static int g_ntb_msix_idx = 1; +TUNABLE_INT("hw.ntb.msix_mw_idx", &g_ntb_msix_idx); SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, CTLFLAG_RDTUN, &g_ntb_msix_idx, 0, "Use this memory window to access the peer MSIX message complex on " "certain Xeon-based NTB systems, as a workaround for a hardware errata. " @@ -461,6 +469,18 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx, "available memory window. Both sides of the NTB MUST set the same " "value here! (Applies on Xeon platforms with SDOORBELL_LOCKUP errata.)"); +/* Hardware owns the low 16 bits of features. */ +#define NTB_BAR_SIZE_4K (1 << 0) +#define NTB_SDOORBELL_LOCKUP (1 << 1) +#define NTB_SB01BASE_LOCKUP (1 << 2) +#define NTB_B2BDOORBELL_BIT14 (1 << 3) +/* Software/configuration owns the top 16 bits. */ +#define NTB_SPLIT_BAR (1ull << 16) + +#define NTB_FEATURES_STR \ + "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \ + "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K" + static struct ntb_hw_info pci_ids[] = { /* XXX: PS/SS IDs left out until they are supported. */ { 0x0C4E8086, "BWD Atom Processor S1200 Non-Transparent Bridge B2B", @@ -609,35 +629,15 @@ SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar5_addr32, CTLFLAG_RDTUN, */ MALLOC_DEFINE(M_NTB, "ntb_hw", "ntb_hw driver memory allocations"); -static device_method_t ntb_pci_methods[] = { - /* Device interface */ - DEVMETHOD(device_probe, ntb_probe), - DEVMETHOD(device_attach, ntb_attach), - DEVMETHOD(device_detach, ntb_detach), - DEVMETHOD_END -}; - -static driver_t ntb_pci_driver = { - "ntb_hw", - ntb_pci_methods, - sizeof(struct ntb_softc), -}; - -static devclass_t ntb_devclass; -DRIVER_MODULE(ntb_hw, pci, ntb_pci_driver, ntb_devclass, NULL, NULL); -MODULE_VERSION(ntb_hw, 1); - -SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls"); - /* * OS <-> Driver linkage functions */ static int -ntb_probe(device_t device) +intel_ntb_probe(device_t device) { struct ntb_hw_info *p; - p = ntb_get_device_info(pci_get_devid(device)); + p = intel_ntb_get_device_info(pci_get_devid(device)); if (p == NULL) return (ENXIO); @@ -646,14 +646,14 @@ ntb_probe(device_t device) } static int -ntb_attach(device_t device) +intel_ntb_attach(device_t device) { struct ntb_softc *ntb; struct ntb_hw_info *p; int error; - ntb = DEVICE2SOFTC(device); - p = ntb_get_device_info(pci_get_devid(device)); + ntb = device_get_softc(device); + p = intel_ntb_get_device_info(pci_get_devid(device)); ntb->device = device; ntb->type = p->type; @@ -666,47 +666,52 @@ ntb_attach(device_t device) callout_init(&ntb->lr_timer, CALLOUT_MPSAFE); callout_init(&ntb->peer_msix_work, 1); mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN); - mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF); if (ntb->type == NTB_ATOM) - error = ntb_detect_atom(ntb); + error = intel_ntb_detect_atom(ntb); else - error = ntb_detect_xeon(ntb); + error = intel_ntb_detect_xeon(ntb); if (error != 0) goto out; - ntb_detect_max_mw(ntb); + intel_ntb_detect_max_mw(ntb); pci_enable_busmaster(ntb->device); - error = ntb_map_pci_bars(ntb); + error = intel_ntb_map_pci_bars(ntb); if (error != 0) goto out; if (ntb->type == NTB_ATOM) - error = ntb_atom_init_dev(ntb); + error = intel_ntb_atom_init_dev(ntb); else - error = ntb_xeon_init_dev(ntb); + error = intel_ntb_xeon_init_dev(ntb); if (error != 0) goto out; 
- ntb_spad_clear(ntb); + intel_ntb_spad_clear(device); + + intel_ntb_poll_link(ntb); - ntb_poll_link(ntb); + intel_ntb_sysctl_init(ntb); - ntb_sysctl_init(ntb); + /* Attach children to this controller */ + error = ntb_register_device(device); out: if (error != 0) - ntb_detach(device); + intel_ntb_detach(device); return (error); } static int -ntb_detach(device_t device) +intel_ntb_detach(device_t device) { struct ntb_softc *ntb; - ntb = DEVICE2SOFTC(device); + ntb = device_get_softc(device); + + /* Detach & delete all children */ + ntb_unregister_device(device); if (ntb->self_reg != NULL) { DB_MASK_LOCK(ntb); @@ -718,13 +723,12 @@ ntb_detach(device_t device) callout_drain(&ntb->peer_msix_work); pci_disable_busmaster(ntb->device); if (ntb->type == NTB_XEON) - ntb_teardown_xeon(ntb); - ntb_teardown_interrupts(ntb); + intel_ntb_teardown_xeon(ntb); + intel_ntb_teardown_interrupts(ntb); mtx_destroy(&ntb->db_mask_lock); - mtx_destroy(&ntb->ctx_lock); - ntb_unmap_pci_bar(ntb); + intel_ntb_unmap_pci_bar(ntb); return (0); } @@ -733,7 +737,7 @@ ntb_detach(device_t device) * Driver internal routines */ static inline enum ntb_bar -ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw) +intel_ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw) { KASSERT(mw < ntb->mw_count, @@ -748,7 +752,7 @@ bar_is_64bit(struct ntb_softc *ntb, enum ntb_bar bar) { /* XXX This assertion could be stronger. */ KASSERT(bar < NTB_MAX_BARS, ("bogus bar")); - return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(NTB_SPLIT_BAR)); + return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(ntb, NTB_SPLIT_BAR)); } static inline void @@ -789,7 +793,7 @@ bar_get_xlat_params(struct ntb_softc *ntb, enum ntb_bar bar, uint32_t *base, } static int -ntb_map_pci_bars(struct ntb_softc *ntb) +intel_ntb_map_pci_bars(struct ntb_softc *ntb) { int rc; @@ -814,7 +818,7 @@ ntb_map_pci_bars(struct ntb_softc *ntb) ntb->bar_info[NTB_B2B_BAR_2].ssz_off = XEON_SBAR4SZ_OFFSET; ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off = XEON_PBAR4XLAT_OFFSET; - if (!HAS_FEATURE(NTB_SPLIT_BAR)) + if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR)) goto out; ntb->bar_info[NTB_B2B_BAR_3].pci_resource_id = PCIR_BAR(5); @@ -888,7 +892,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar) * but the PCI driver does not honor the size in this call, so we have * to modify it after the fact. */ - if (HAS_FEATURE(NTB_BAR_SIZE_4K)) { + if (HAS_FEATURE(ntb, NTB_BAR_SIZE_4K)) { if (bar->pci_resource_id == PCIR_BAR(2)) bar_size_bits = pci_read_config(ntb->device, XEON_PBAR23SZ_OFFSET, 1); @@ -915,7 +919,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar) * Optionally, mark MW BARs as anything other than UC to improve * performance. 
*/ - mapmode = ntb_pat_flags(); + mapmode = intel_ntb_pat_flags(); if (mapmode == bar->map_mode) return (0); @@ -928,7 +932,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar) PCI_RID2BAR(bar->pci_resource_id), bar->vbase, (char *)bar->vbase + bar->size - 1, (void *)bar->pbase, (void *)(bar->pbase + bar->size - 1), - ntb_vm_memattr_to_str(mapmode)); + intel_ntb_vm_memattr_to_str(mapmode)); } else device_printf(ntb->device, "Unable to mark BAR%d v:[%p-%p] p:[%p-%p] as " @@ -936,13 +940,13 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar) PCI_RID2BAR(bar->pci_resource_id), bar->vbase, (char *)bar->vbase + bar->size - 1, (void *)bar->pbase, (void *)(bar->pbase + bar->size - 1), - ntb_vm_memattr_to_str(mapmode), rc); + intel_ntb_vm_memattr_to_str(mapmode), rc); /* Proceed anyway */ return (0); } static void -ntb_unmap_pci_bar(struct ntb_softc *ntb) +intel_ntb_unmap_pci_bar(struct ntb_softc *ntb) { struct ntb_pci_bar_info *current_bar; int i; @@ -957,7 +961,7 @@ ntb_unmap_pci_bar(struct ntb_softc *ntb) } static int -ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors) +intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors) { uint32_t i; int rc; @@ -1012,7 +1016,7 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, prefer_intx_to_remap, CTLFLAG_RDTUN, * round-robin fashion. */ static int -ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail) +intel_ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail) { u_int *vectors; uint32_t i; @@ -1032,7 +1036,7 @@ ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail) } static int -ntb_init_isr(struct ntb_softc *ntb) +intel_ntb_init_isr(struct ntb_softc *ntb) { uint32_t desired_vectors, num_vectors; int rc; @@ -1058,7 +1062,7 @@ ntb_init_isr(struct ntb_softc *ntb) num_vectors--; if (rc == 0 && num_vectors < desired_vectors) { - rc = ntb_remap_msix(ntb->device, desired_vectors, + rc = intel_ntb_remap_msix(ntb->device, desired_vectors, num_vectors); if (rc == 0) num_vectors = desired_vectors; @@ -1071,7 +1075,7 @@ ntb_init_isr(struct ntb_softc *ntb) num_vectors = 1; if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) { - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { device_printf(ntb->device, "Errata workaround does not support MSI or INTX\n"); return (EINVAL); @@ -1079,32 +1083,30 @@ ntb_init_isr(struct ntb_softc *ntb) ntb->db_vec_count = 1; ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT; - rc = ntb_setup_legacy_interrupt(ntb); + rc = intel_ntb_setup_legacy_interrupt(ntb); } else { if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS && - HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { device_printf(ntb->device, "Errata workaround expects %d doorbell bits\n", XEON_NONLINK_DB_MSIX_BITS); return (EINVAL); } - ntb_create_msix_vec(ntb, num_vectors); - rc = ntb_setup_msix(ntb, num_vectors); - if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - ntb_get_msix_info(ntb); + intel_ntb_create_msix_vec(ntb, num_vectors); + rc = intel_ntb_setup_msix(ntb, num_vectors); } if (rc != 0) { device_printf(ntb->device, "Error allocating interrupts: %d\n", rc); - ntb_free_msix_vec(ntb); + intel_ntb_free_msix_vec(ntb); } return (rc); } static int -ntb_setup_legacy_interrupt(struct ntb_softc *ntb) +intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb) { int rc; @@ -1131,7 +1133,7 @@ ntb_setup_legacy_interrupt(struct ntb_softc *ntb) } static void -ntb_teardown_interrupts(struct ntb_softc *ntb) +intel_ntb_teardown_interrupts(struct 
ntb_softc *ntb) { struct ntb_int_info *current_int; int i; @@ -1147,7 +1149,7 @@ ntb_teardown_interrupts(struct ntb_softc *ntb) rman_get_rid(current_int->res), current_int->res); } - ntb_free_msix_vec(ntb); + intel_ntb_free_msix_vec(ntb); pci_release_msi(ntb->device); } @@ -1160,11 +1162,11 @@ db_ioread(struct ntb_softc *ntb, uint64_t regoff) { if (ntb->type == NTB_ATOM) - return (ntb_reg_read(8, regoff)); + return (intel_ntb_reg_read(8, regoff)); KASSERT(ntb->type == NTB_XEON, ("bad ntb type")); - return (ntb_reg_read(2, regoff)); + return (intel_ntb_reg_read(2, regoff)); } static inline void @@ -1186,89 +1188,78 @@ db_iowrite_raw(struct ntb_softc *ntb, uint64_t regoff, uint64_t val) { if (ntb->type == NTB_ATOM) { - ntb_reg_write(8, regoff, val); + intel_ntb_reg_write(8, regoff, val); return; } KASSERT(ntb->type == NTB_XEON, ("bad ntb type")); - ntb_reg_write(2, regoff, (uint16_t)val); + intel_ntb_reg_write(2, regoff, (uint16_t)val); } -void -ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits) +static void +intel_ntb_db_set_mask(device_t dev, uint64_t bits) { - - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - return; + struct ntb_softc *ntb = device_get_softc(dev); DB_MASK_LOCK(ntb); ntb->db_mask |= bits; - db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + if (!HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); DB_MASK_UNLOCK(ntb); } -void -ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits) +static void +intel_ntb_db_clear_mask(device_t dev, uint64_t bits) { + struct ntb_softc *ntb = device_get_softc(dev); + uint64_t ibits; + int i; KASSERT((bits & ~ntb->db_valid_mask) == 0, ("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__, (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - return; - DB_MASK_LOCK(ntb); + ibits = ntb->fake_db_bell & ntb->db_mask & bits; ntb->db_mask &= ~bits; - db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { + /* Simulate fake interrupts if unmasked DB bits are set. */ + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if ((ibits & intel_ntb_db_vector_mask(dev, i)) != 0) + swi_sched(ntb->int_info[i].tag, 0); + } + } else { + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + } DB_MASK_UNLOCK(ntb); } -uint64_t -ntb_db_read(struct ntb_softc *ntb) +static uint64_t +intel_ntb_db_read(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { - uint64_t res; - unsigned i; - - res = 0; - for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { - if (ntb->msix_vec[i].masked != 0) - res |= ntb_db_vector_mask(ntb, i); - } - return (res); - } + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) + return (ntb->fake_db_bell); return (db_ioread(ntb, ntb->self_reg->db_bell)); } -void -ntb_db_clear(struct ntb_softc *ntb, uint64_t bits) +static void +intel_ntb_db_clear(device_t dev, uint64_t bits) { + struct ntb_softc *ntb = device_get_softc(dev); KASSERT((bits & ~ntb->db_valid_mask) == 0, ("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__, (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { - unsigned i; - - for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { - if ((bits & ntb_db_vector_mask(ntb, i)) != 0) { - DB_MASK_LOCK(ntb); - if (ntb->msix_vec[i].masked != 0) { - /* XXX These need a public API. 
*/
-#if 0
-					pci_unmask_msix(ntb->device, i);
-#endif
-					ntb->msix_vec[i].masked = 0;
-				}
-				DB_MASK_UNLOCK(ntb);
-			}
-		}
+	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+		DB_MASK_LOCK(ntb);
+		ntb->fake_db_bell &= ~bits;
+		DB_MASK_UNLOCK(ntb);
 		return;
 	}
@@ -1276,43 +1267,59 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
 }
 
 static inline uint64_t
-ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
+intel_ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
 {
 	uint64_t shift, mask;
 
+	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+		/*
+		 * Remap vectors in a custom way so that at least the first
+		 * three doorbells do not generate stray events. This breaks
+		 * Linux compatibility (if one existed) when more than one
+		 * DB is used (not by if_ntb).
+		 */
+		if (db_vector < XEON_NONLINK_DB_MSIX_BITS - 1)
+			return (1 << db_vector);
+		if (db_vector == XEON_NONLINK_DB_MSIX_BITS - 1)
+			return (0x7ffc);
+	}
+
 	shift = ntb->db_vec_shift;
 	mask = (1ull << shift) - 1;
 	return (mask << (shift * db_vector));
 }
 
 static void
-ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
+intel_ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
 {
 	uint64_t vec_mask;
 
 	ntb->last_ts = ticks;
-	vec_mask = ntb_vec_mask(ntb, vec);
+	vec_mask = intel_ntb_vec_mask(ntb, vec);
 
 	if ((vec_mask & ntb->db_link_mask) != 0) {
-		if (ntb_poll_link(ntb))
-			ntb_link_event(ntb);
+		if (intel_ntb_poll_link(ntb))
+			ntb_link_event(ntb->device);
 	}
 
-	if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
 	    (vec_mask & ntb->db_link_mask) == 0) {
 		DB_MASK_LOCK(ntb);
-		if (ntb->msix_vec[vec].masked == 0) {
-			/* XXX These need a public API. */
-#if 0
-			pci_mask_msix(ntb->device, vec);
-#endif
-			ntb->msix_vec[vec].masked = 1;
-		}
+
+		/* Do not report same DB events again if not cleared yet. */
+		vec_mask &= ~ntb->fake_db_bell;
+
+		/* Update our internal doorbell register. */
+		ntb->fake_db_bell |= vec_mask;
+
+		/* Do not report masked DB events. */
+		vec_mask &= ~ntb->db_mask;
+
 		DB_MASK_UNLOCK(ntb);
 	}
 
 	if ((vec_mask & ntb->db_valid_mask) != 0)
-		ntb_db_event(ntb, vec);
+		ntb_db_event(ntb->device, vec);
 }
 
 static void
@@ -1320,18 +1327,18 @@ ndev_vec_isr(void *arg)
 {
 	struct ntb_vec *nvec = arg;
 
-	ntb_interrupt(nvec->ntb, nvec->num);
+	intel_ntb_interrupt(nvec->ntb, nvec->num);
 }
 
 static void
 ndev_irq_isr(void *arg)
 {
 	/* If we couldn't set up MSI-X, we only have the one vector.
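	 * Everything, doorbells and link events alike, then funnels
	 * through vector 0 via intel_ntb_interrupt().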
*/ - ntb_interrupt(arg, 0); + intel_ntb_interrupt(arg, 0); } static int -ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors) +intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors) { uint32_t i; @@ -1346,7 +1353,7 @@ ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors) } static void -ntb_free_msix_vec(struct ntb_softc *ntb) +intel_ntb_free_msix_vec(struct ntb_softc *ntb) { if (ntb->msix_vec == NULL) @@ -1357,7 +1364,7 @@ ntb_free_msix_vec(struct ntb_softc *ntb) } static void -ntb_get_msix_info(struct ntb_softc *ntb) +intel_ntb_get_msix_info(struct ntb_softc *ntb) { struct pci_devinfo *dinfo; struct pcicfg_msix *msix; @@ -1366,8 +1373,6 @@ ntb_get_msix_info(struct ntb_softc *ntb) dinfo = device_get_ivars(ntb->device); msix = &dinfo->cfg.msix; - laddr = data = 0; - CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data)); for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { @@ -1375,7 +1380,7 @@ ntb_get_msix_info(struct ntb_softc *ntb) laddr = bus_read_4(msix->msix_table_res, offset + PCI_MSIX_ENTRY_LOWER_ADDR); - ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr); + intel_ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr); KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE, ("local MSIX addr 0x%x not in MSI base 0x%x", laddr, @@ -1384,14 +1389,14 @@ ntb_get_msix_info(struct ntb_softc *ntb) data = bus_read_4(msix->msix_table_res, offset + PCI_MSIX_ENTRY_DATA); - ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data); + intel_ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data); ntb->msix_data[i].nmd_data = data; } } static struct ntb_hw_info * -ntb_get_device_info(uint32_t device_id) +intel_ntb_get_device_info(uint32_t device_id) { struct ntb_hw_info *ep = pci_ids; @@ -1404,15 +1409,15 @@ ntb_get_device_info(uint32_t device_id) } static void -ntb_teardown_xeon(struct ntb_softc *ntb) +intel_ntb_teardown_xeon(struct ntb_softc *ntb) { if (ntb->reg != NULL) - ntb_link_disable(ntb); + intel_ntb_link_disable(ntb->device); } static void -ntb_detect_max_mw(struct ntb_softc *ntb) +intel_ntb_detect_max_mw(struct ntb_softc *ntb) { if (ntb->type == NTB_ATOM) { @@ -1420,14 +1425,14 @@ ntb_detect_max_mw(struct ntb_softc *ntb) return; } - if (HAS_FEATURE(NTB_SPLIT_BAR)) + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) ntb->mw_count = XEON_HSX_SPLIT_MW_COUNT; else ntb->mw_count = XEON_SNB_MW_COUNT; } static int -ntb_detect_xeon(struct ntb_softc *ntb) +intel_ntb_detect_xeon(struct ntb_softc *ntb) { uint8_t ppd, conn_type; @@ -1442,11 +1447,21 @@ ntb_detect_xeon(struct ntb_softc *ntb) if ((ppd & XEON_PPD_SPLIT_BAR) != 0) ntb->features |= NTB_SPLIT_BAR; + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) && + !HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { + device_printf(ntb->device, + "Can not apply SB01BASE_LOCKUP workaround " + "with split BARs disabled!\n"); + device_printf(ntb->device, + "Expect system hangs under heavy NTB traffic!\n"); + ntb->features &= ~NTB_SB01BASE_LOCKUP; + } + /* * SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP * errata workaround; only do one at a time. 
*/ - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) ntb->features &= ~NTB_SDOORBELL_LOCKUP; conn_type = ppd & XEON_PPD_CONN_TYPE; @@ -1465,7 +1480,7 @@ ntb_detect_xeon(struct ntb_softc *ntb) } static int -ntb_detect_atom(struct ntb_softc *ntb) +intel_ntb_detect_atom(struct ntb_softc *ntb) { uint32_t ppd, conn_type; @@ -1490,7 +1505,7 @@ ntb_detect_atom(struct ntb_softc *ntb) } static int -ntb_xeon_init_dev(struct ntb_softc *ntb) +intel_ntb_xeon_init_dev(struct ntb_softc *ntb) { int rc; @@ -1511,15 +1526,16 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) ntb->peer_reg = &xeon_b2b_reg; ntb->xlat_reg = &xeon_sec_xlat; - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { + ntb->fake_db_bell = 0; ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) % ntb->mw_count; - ntb_printf(2, "Setting up MSIX mw idx %d means %u\n", + intel_ntb_printf(2, "Setting up MSIX mw idx %d means %u\n", g_ntb_msix_idx, ntb->msix_mw_idx); - rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx, + rc = intel_ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx, VM_MEMATTR_UNCACHEABLE); KASSERT(rc == 0, ("shouldn't fail")); - } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + } else if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) { /* * There is a Xeon hardware errata related to writes to SDOORBELL or * B2BDOORBELL in conjunction with inbound access to NTB MMIO space, @@ -1529,12 +1545,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) */ ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) % ntb->mw_count; - ntb_printf(2, "Setting up b2b mw idx %d means %u\n", + intel_ntb_printf(2, "Setting up b2b mw idx %d means %u\n", g_ntb_mw_idx, ntb->b2b_mw_idx); - rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, + rc = intel_ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, VM_MEMATTR_UNCACHEABLE); KASSERT(rc == 0, ("shouldn't fail")); - } else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14)) + } else if (HAS_FEATURE(ntb, NTB_B2BDOORBELL_BIT14)) /* * HW Errata on bit 14 of b2bdoorbell register. Writes will not be * mirrored to the remote system. 
Shrink the number of bits by one, @@ -1557,7 +1573,7 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) return (rc); /* Enable Bus Master and Memory Space on the secondary side */ - ntb_reg_write(2, XEON_SPCICMD_OFFSET, + intel_ntb_reg_write(2, XEON_SPCICMD_OFFSET, PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); /* @@ -1568,16 +1584,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); DB_MASK_UNLOCK(ntb); - rc = xeon_setup_msix_bar(ntb); - if (rc != 0) - return (rc); - - rc = ntb_init_isr(ntb); + rc = intel_ntb_init_isr(ntb); return (rc); } static int -ntb_atom_init_dev(struct ntb_softc *ntb) +intel_ntb_atom_init_dev(struct ntb_softc *ntb) { int error; @@ -1604,15 +1616,15 @@ ntb_atom_init_dev(struct ntb_softc *ntb) configure_atom_secondary_side_bars(ntb); /* Enable Bus Master and Memory Space on the secondary side */ - ntb_reg_write(2, ATOM_SPCICMD_OFFSET, + intel_ntb_reg_write(2, ATOM_SPCICMD_OFFSET, PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); - error = ntb_init_isr(ntb); + error = intel_ntb_init_isr(ntb); if (error != 0) return (error); /* Initiate PCI-E link training */ - ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); callout_reset(&ntb->heartbeat_timer, 0, atom_link_hb, ntb); @@ -1625,19 +1637,19 @@ configure_atom_secondary_side_bars(struct ntb_softc *ntb) { if (ntb->dev_type == NTB_DEV_USD) { - ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET, + intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET, XEON_B2B_BAR2_ADDR64); - ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET, + intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET, XEON_B2B_BAR4_ADDR64); - ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64); - ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64); + intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64); + intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64); } else { - ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET, + intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET, XEON_B2B_BAR2_ADDR64); - ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET, + intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET, XEON_B2B_BAR4_ADDR64); - ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64); - ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64); + intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64); + intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64); } } @@ -1664,7 +1676,7 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx, struct ntb_pci_bar_info *bar; uint8_t bar_sz; - if (!HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3) + if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3) return; bar = &ntb->bar_info[idx]; @@ -1688,28 +1700,28 @@ xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr, uint32_t base_reg, lmt_reg; bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg); - if (idx == regbar) - bar_addr += ntb->b2b_off; + if (idx == regbar) { + if (ntb->b2b_off) + bar_addr += ntb->b2b_off; + else + bar_addr = 0; + } - /* - * Set limit registers first to avoid an errata where setting the base - * registers locks the limit registers. 
- */ if (!bar_is_64bit(ntb, idx)) { - ntb_reg_write(4, lmt_reg, bar_addr); - reg_val = ntb_reg_read(4, lmt_reg); + intel_ntb_reg_write(4, base_reg, bar_addr); + reg_val = intel_ntb_reg_read(4, base_reg); (void)reg_val; - ntb_reg_write(4, base_reg, bar_addr); - reg_val = ntb_reg_read(4, base_reg); + intel_ntb_reg_write(4, lmt_reg, bar_addr); + reg_val = intel_ntb_reg_read(4, lmt_reg); (void)reg_val; } else { - ntb_reg_write(8, lmt_reg, bar_addr); - reg_val = ntb_reg_read(8, lmt_reg); + intel_ntb_reg_write(8, base_reg, bar_addr); + reg_val = intel_ntb_reg_read(8, base_reg); (void)reg_val; - ntb_reg_write(8, base_reg, bar_addr); - reg_val = ntb_reg_read(8, base_reg); + intel_ntb_reg_write(8, lmt_reg, bar_addr); + reg_val = intel_ntb_reg_read(8, lmt_reg); (void)reg_val; } } @@ -1720,30 +1732,17 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx) struct ntb_pci_bar_info *bar; bar = &ntb->bar_info[idx]; - if (HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) { - ntb_reg_write(4, bar->pbarxlat_off, base_addr); - base_addr = ntb_reg_read(4, bar->pbarxlat_off); + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) { + intel_ntb_reg_write(4, bar->pbarxlat_off, base_addr); + base_addr = intel_ntb_reg_read(4, bar->pbarxlat_off); } else { - ntb_reg_write(8, bar->pbarxlat_off, base_addr); - base_addr = ntb_reg_read(8, bar->pbarxlat_off); + intel_ntb_reg_write(8, bar->pbarxlat_off, base_addr); + base_addr = intel_ntb_reg_read(8, bar->pbarxlat_off); } (void)base_addr; } static int -xeon_setup_msix_bar(struct ntb_softc *ntb) -{ - enum ntb_bar bar_num; - - if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - return (0); - - bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx); - ntb->peer_lapic_bar = &ntb->bar_info[bar_num]; - return (0); -} - -static int xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr) { @@ -1757,7 +1756,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, b2b_bar_num = NTB_CONFIG_BAR; ntb->b2b_off = 0; } else { - b2b_bar_num = ntb_mw_to_bar(ntb, ntb->b2b_mw_idx); + b2b_bar_num = intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx); KASSERT(b2b_bar_num > 0 && b2b_bar_num < NTB_MAX_BARS, ("invalid b2b mw bar")); @@ -1788,7 +1787,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, bar_addr = addr->bar0_addr; else if (b2b_bar_num == NTB_B2B_BAR_1) bar_addr = addr->bar2_addr64; - else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR)) + else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR)) bar_addr = addr->bar4_addr64; else if (b2b_bar_num == NTB_B2B_BAR_2) bar_addr = addr->bar4_addr32; @@ -1797,7 +1796,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, else KASSERT(false, ("invalid bar")); - ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr); + intel_ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr); /* * Other SBARs are normally hit by the PBAR xlat, except for the b2b @@ -1808,7 +1807,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, */ xeon_set_sbar_base_and_limit(ntb, addr->bar2_addr64, NTB_B2B_BAR_1, b2b_bar_num); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { xeon_set_sbar_base_and_limit(ntb, addr->bar4_addr32, NTB_B2B_BAR_2, b2b_bar_num); xeon_set_sbar_base_and_limit(ntb, addr->bar5_addr32, @@ -1818,56 +1817,41 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, NTB_B2B_BAR_2, b2b_bar_num); /* Zero incoming 
translation addrs */ - ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0); - ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0); - - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { - size_t size, xlatoffset; + intel_ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0); + intel_ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0); - switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) { - case NTB_B2B_BAR_1: - size = 8; - xlatoffset = XEON_SBAR2XLAT_OFFSET; - break; - case NTB_B2B_BAR_2: - xlatoffset = XEON_SBAR4XLAT_OFFSET; - if (HAS_FEATURE(NTB_SPLIT_BAR)) - size = 4; - else - size = 8; - break; - case NTB_B2B_BAR_3: - xlatoffset = XEON_SBAR5XLAT_OFFSET; - size = 4; - break; - default: - KASSERT(false, ("Bogus msix mw idx: %u", - ntb->msix_mw_idx)); - return (EINVAL); - } + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { + uint32_t xlat_reg, lmt_reg; + enum ntb_bar bar_num; /* * We point the chosen MSIX MW BAR xlat to remote LAPIC for * workaround */ - if (size == 4) { - ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE); - ntb->msix_xlat = ntb_reg_read(4, xlatoffset); + bar_num = intel_ntb_mw_to_bar(ntb, ntb->msix_mw_idx); + bar_get_xlat_params(ntb, bar_num, NULL, &xlat_reg, &lmt_reg); + if (bar_is_64bit(ntb, bar_num)) { + intel_ntb_reg_write(8, xlat_reg, MSI_INTEL_ADDR_BASE); + ntb->msix_xlat = intel_ntb_reg_read(8, xlat_reg); + intel_ntb_reg_write(8, lmt_reg, 0); } else { - ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE); - ntb->msix_xlat = ntb_reg_read(8, xlatoffset); + intel_ntb_reg_write(4, xlat_reg, MSI_INTEL_ADDR_BASE); + ntb->msix_xlat = intel_ntb_reg_read(4, xlat_reg); + intel_ntb_reg_write(4, lmt_reg, 0); } + + ntb->peer_lapic_bar = &ntb->bar_info[bar_num]; } - (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET); - (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET); + (void)intel_ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET); + (void)intel_ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET); /* Zero outgoing translation limits (whole bar size windows) */ - ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0); - ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0); + intel_ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0); + intel_ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0); /* Set outgoing translation offsets */ xeon_set_pbar_xlat(ntb, peer_addr->bar2_addr64, NTB_B2B_BAR_1); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { xeon_set_pbar_xlat(ntb, peer_addr->bar4_addr32, NTB_B2B_BAR_2); xeon_set_pbar_xlat(ntb, peer_addr->bar5_addr32, NTB_B2B_BAR_3); } else @@ -1879,7 +1863,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, bar_addr = peer_addr->bar0_addr; else if (b2b_bar_num == NTB_B2B_BAR_1) bar_addr = peer_addr->bar2_addr64; - else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR)) + else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR)) bar_addr = peer_addr->bar4_addr64; else if (b2b_bar_num == NTB_B2B_BAR_2) bar_addr = peer_addr->bar4_addr32; @@ -1892,8 +1876,8 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, * B2B_XLAT_OFFSET is a 64-bit register but can only be written 32 bits * at a time. 
*/ - ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff); - ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32); + intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff); + intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32); return (0); } @@ -1912,7 +1896,7 @@ link_is_up(struct ntb_softc *ntb) if (ntb->type == NTB_XEON) return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good || - !HAS_FEATURE(NTB_SB01BASE_LOCKUP))); + !HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))); KASSERT(ntb->type == NTB_ATOM, ("ntb type")); return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) == 0); @@ -1925,11 +1909,11 @@ atom_link_is_err(struct ntb_softc *ntb) KASSERT(ntb->type == NTB_ATOM, ("ntb type")); - status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET); + status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET); if ((status & ATOM_LTSSMSTATEJMP_FORCEDETECT) != 0) return (true); - status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET); + status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET); return ((status & ATOM_IBIST_ERR_OFLOW) != 0); } @@ -1952,8 +1936,8 @@ atom_link_hb(void *arg) goto out; } - if (ntb_poll_link(ntb)) - ntb_link_event(ntb); + if (intel_ntb_poll_link(ntb)) + ntb_link_event(ntb->device); if (!link_is_up(ntb) && atom_link_is_err(ntb)) { /* Link is down with error, proceed with recovery */ @@ -1971,166 +1955,47 @@ atom_perform_link_restart(struct ntb_softc *ntb) uint32_t status; /* Driver resets the NTB ModPhy lanes - magic! */ - ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0); - ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40); - ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60); - ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60); + intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0); + intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40); + intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60); + intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60); /* Driver waits 100ms to allow the NTB ModPhy to settle */ pause("ModPhy", hz / 10); /* Clear AER Errors, write to clear */ - status = ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET); + status = intel_ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET); status &= PCIM_AER_COR_REPLAY_ROLLOVER; - ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status); + intel_ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status); /* Clear unexpected electrical idle event in LTSSM, write to clear */ - status = ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET); + status = intel_ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET); status |= ATOM_LTSSMERRSTS0_UNEXPECTEDEI; - ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status); + intel_ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status); /* Clear DeSkew Buffer error, write to clear */ - status = ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET); + status = intel_ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET); status |= ATOM_DESKEWSTS_DBERR; - ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status); + intel_ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status); - status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET); + status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET); status &= ATOM_IBIST_ERR_OFLOW; - ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status); + intel_ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status); /* Releases the NTB state machine to allow the link to retrain */ - status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET); + status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET); status &= ~ATOM_LTSSMSTATEJMP_FORCEDETECT; - ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status); -} - -/* - * ntb_set_ctx() - associate a driver context with an ntb device - * @ntb: NTB device context - * 
@ctx: Driver context - * @ctx_ops: Driver context operations - * - * Associate a driver context and operations with a ntb device. The context is - * provided by the client driver, and the driver may associate a different - * context with each ntb device. - * - * Return: Zero if the context is associated, otherwise an error number. - */ -int -ntb_set_ctx(struct ntb_softc *ntb, void *ctx, const struct ntb_ctx_ops *ops) -{ - - if (ctx == NULL || ops == NULL) - return (EINVAL); - if (ntb->ctx_ops != NULL) - return (EINVAL); - - CTX_LOCK(ntb); - if (ntb->ctx_ops != NULL) { - CTX_UNLOCK(ntb); - return (EINVAL); - } - ntb->ntb_ctx = ctx; - ntb->ctx_ops = ops; - CTX_UNLOCK(ntb); - - return (0); -} - -/* - * It is expected that this will only be used from contexts where the ctx_lock - * is not needed to protect ntb_ctx lifetime. - */ -void * -ntb_get_ctx(struct ntb_softc *ntb, const struct ntb_ctx_ops **ops) -{ - - KASSERT(ntb->ntb_ctx != NULL && ntb->ctx_ops != NULL, ("bogus")); - if (ops != NULL) - *ops = ntb->ctx_ops; - return (ntb->ntb_ctx); -} - -/* - * ntb_clear_ctx() - disassociate any driver context from an ntb device - * @ntb: NTB device context - * - * Clear any association that may exist between a driver context and the ntb - * device. - */ -void -ntb_clear_ctx(struct ntb_softc *ntb) -{ - - CTX_LOCK(ntb); - ntb->ntb_ctx = NULL; - ntb->ctx_ops = NULL; - CTX_UNLOCK(ntb); -} - -/* - * ntb_link_event() - notify driver context of a change in link status - * @ntb: NTB device context - * - * Notify the driver context that the link status may have changed. The driver - * should call ntb_link_is_up() to get the current status. - */ -void -ntb_link_event(struct ntb_softc *ntb) -{ - - CTX_LOCK(ntb); - if (ntb->ctx_ops != NULL && ntb->ctx_ops->link_event != NULL) - ntb->ctx_ops->link_event(ntb->ntb_ctx); - CTX_UNLOCK(ntb); + intel_ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status); } -/* - * ntb_db_event() - notify driver context of a doorbell event - * @ntb: NTB device context - * @vector: Interrupt vector number - * - * Notify the driver context of a doorbell event. If hardware supports - * multiple interrupt vectors for doorbells, the vector number indicates which - * vector received the interrupt. The vector number is relative to the first - * vector used for doorbells, starting at zero, and must be less than - * ntb_db_vector_count(). The driver may call ntb_db_read() to check which - * doorbell bits need service, and ntb_db_vector_mask() to determine which of - * those bits are associated with the vector number. - */ -static void -ntb_db_event(struct ntb_softc *ntb, uint32_t vec) -{ - - CTX_LOCK(ntb); - if (ntb->ctx_ops != NULL && ntb->ctx_ops->db_event != NULL) - ntb->ctx_ops->db_event(ntb->ntb_ctx, vec); - CTX_UNLOCK(ntb); -} - -/* - * ntb_link_enable() - enable the link on the secondary side of the ntb - * @ntb: NTB device context - * @max_speed: The maximum link speed expressed as PCIe generation number[0] - * @max_width: The maximum link width expressed as the number of PCIe lanes[0] - * - * Enable the link on the secondary side of the ntb. This can only be done - * from the primary side of the ntb in primary or b2b topology. The ntb device - * should train the link to its maximum speed and width, or the requested speed - * and width, whichever is smaller, if supported. - * - * Return: Zero on success, otherwise an error number. - * - * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed - * and width input will be ignored. 
- */ -int -ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused, - enum ntb_width w __unused) +static int +intel_ntb_link_enable(device_t dev, enum ntb_speed speed __unused, + enum ntb_width width __unused) { + struct ntb_softc *ntb = device_get_softc(dev); uint32_t cntl; - ntb_printf(2, "%s\n", __func__); + intel_ntb_printf(2, "%s\n", __func__); if (ntb->type == NTB_ATOM) { pci_write_config(ntb->device, NTB_PPD_OFFSET, @@ -2139,57 +2004,47 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused, } if (ntb->conn_type == NTB_CONN_TRANSPARENT) { - ntb_link_event(ntb); + ntb_link_event(dev); return (0); } - cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl); cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK); cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP; cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP; - if (HAS_FEATURE(NTB_SPLIT_BAR)) + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) cntl |= NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP; - ntb_reg_write(4, ntb->reg->ntb_ctl, cntl); + intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl); return (0); } -/* - * ntb_link_disable() - disable the link on the secondary side of the ntb - * @ntb: NTB device context - * - * Disable the link on the secondary side of the ntb. This can only be done - * from the primary side of the ntb in primary or b2b topology. The ntb device - * should disable the link. Returning from this call must indicate that a - * barrier has passed, though with no more writes may pass in either direction - * across the link, except if this call returns an error number. - * - * Return: Zero on success, otherwise an error number. - */ -int -ntb_link_disable(struct ntb_softc *ntb) +static int +intel_ntb_link_disable(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); uint32_t cntl; - ntb_printf(2, "%s\n", __func__); + intel_ntb_printf(2, "%s\n", __func__); if (ntb->conn_type == NTB_CONN_TRANSPARENT) { - ntb_link_event(ntb); + ntb_link_event(dev); return (0); } - cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl); cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP); cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP); - if (HAS_FEATURE(NTB_SPLIT_BAR)) + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP); cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK; - ntb_reg_write(4, ntb->reg->ntb_ctl, cntl); + intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl); return (0); } -bool -ntb_link_enabled(struct ntb_softc *ntb) +static bool +intel_ntb_link_enabled(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); uint32_t cntl; if (ntb->type == NTB_ATOM) { @@ -2200,7 +2055,7 @@ ntb_link_enabled(struct ntb_softc *ntb) if (ntb->conn_type == NTB_CONN_TRANSPARENT) return (true); - cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl); return ((cntl & NTB_CNTL_LINK_DISABLE) == 0); } @@ -2225,11 +2080,11 @@ recover_atom_link(void *arg) if (atom_link_is_err(ntb)) goto retry; - status32 = ntb_reg_read(4, ntb->reg->ntb_ctl); + status32 = intel_ntb_reg_read(4, ntb->reg->ntb_ctl); if ((status32 & ATOM_CNTL_LINK_DOWN) != 0) goto out; - status32 = ntb_reg_read(4, ntb->reg->lnk_sta); + status32 = intel_ntb_reg_read(4, ntb->reg->lnk_sta); width = NTB_LNK_STA_WIDTH(status32); speed = status32 & NTB_LINK_SPEED_MASK; @@ -2252,18 +2107,18 @@ retry: * Polls the HW link status register(s); returns true if something has 
changed. */ static bool -ntb_poll_link(struct ntb_softc *ntb) +intel_ntb_poll_link(struct ntb_softc *ntb) { uint32_t ntb_cntl; uint16_t reg_val; if (ntb->type == NTB_ATOM) { - ntb_cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + ntb_cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl); if (ntb_cntl == ntb->ntb_ctl) return (false); ntb->ntb_ctl = ntb_cntl; - ntb->lnk_sta = ntb_reg_read(4, ntb->reg->lnk_sta); + ntb->lnk_sta = intel_ntb_reg_read(4, ntb->reg->lnk_sta); } else { db_iowrite_raw(ntb, ntb->self_reg->db_bell, ntb->db_link_mask); @@ -2273,11 +2128,11 @@ ntb_poll_link(struct ntb_softc *ntb) ntb->lnk_sta = reg_val; - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { if (_xeon_link_is_up(ntb)) { if (!ntb->peer_msix_good) { callout_reset(&ntb->peer_msix_work, 0, - ntb_exchange_msix, ntb); + intel_ntb_exchange_msix, ntb); return (false); } } else { @@ -2290,7 +2145,7 @@ ntb_poll_link(struct ntb_softc *ntb) } static inline enum ntb_speed -ntb_link_sta_speed(struct ntb_softc *ntb) +intel_ntb_link_sta_speed(struct ntb_softc *ntb) { if (!link_is_up(ntb)) @@ -2299,7 +2154,7 @@ ntb_link_sta_speed(struct ntb_softc *ntb) } static inline enum ntb_width -ntb_link_sta_width(struct ntb_softc *ntb) +intel_ntb_link_sta_width(struct ntb_softc *ntb) { if (!link_is_up(ntb)) @@ -2321,7 +2176,7 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0, #define NTB_REGFLAGS_MASK (NTB_REGSZ_MASK | NTB_DB_READ | NTB_PCI_REG) static void -ntb_sysctl_init(struct ntb_softc *ntb) +intel_ntb_sysctl_init(struct ntb_softc *ntb) { struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar; struct sysctl_ctx_list *ctx; @@ -2424,7 +2279,7 @@ ntb_sysctl_init(struct ntb_softc *ntb) CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_64 | ntb->xlat_reg->bar2_xlat, sysctl_handle_register, "QU", "Incoming XLAT23 register"); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat4", CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 | ntb->xlat_reg->bar4_xlat, @@ -2444,7 +2299,7 @@ ntb_sysctl_init(struct ntb_softc *ntb) CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_64 | ntb->xlat_reg->bar2_limit, sysctl_handle_register, "QU", "Incoming LMT23 register"); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt4", CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 | ntb->xlat_reg->bar4_limit, @@ -2535,7 +2390,7 @@ ntb_sysctl_init(struct ntb_softc *ntb) CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_64 | ntb->bar_info[NTB_B2B_BAR_1].pbarxlat_off, sysctl_handle_register, "QU", "Outgoing XLAT23 register"); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat4", CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 | ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off, @@ -2555,7 +2410,7 @@ ntb_sysctl_init(struct ntb_softc *ntb) CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_64 | XEON_PBAR2LMT_OFFSET, sysctl_handle_register, "QU", "Outgoing LMT23 register"); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) { SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt4", CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 | XEON_PBAR4LMT_OFFSET, @@ -2579,7 +2434,7 @@ ntb_sysctl_init(struct ntb_softc *ntb) CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_64 | ntb->xlat_reg->bar2_base, sysctl_handle_register, "QU", "Secondary BAR23 base register"); - if (HAS_FEATURE(NTB_SPLIT_BAR)) { + if (HAS_FEATURE(ntb, 
NTB_SPLIT_BAR)) { SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar4_base", CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 | ntb->xlat_reg->bar4_base, @@ -2602,13 +2457,10 @@ ntb_sysctl_init(struct ntb_softc *ntb) static int sysctl_handle_features(SYSCTL_HANDLER_ARGS) { - struct ntb_softc *ntb; + struct ntb_softc *ntb = arg1; struct sbuf sb; int error; - error = 0; - ntb = arg1; - sbuf_new_for_sysctl(&sb, NULL, 256, req); sbuf_printf(&sb, "%b", ntb->features, NTB_FEATURES_STR); @@ -2623,14 +2475,11 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS) static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS) { - struct ntb_softc *ntb; + struct ntb_softc *ntb = arg1; unsigned old, new; int error; - error = 0; - ntb = arg1; - - old = ntb_link_enabled(ntb); + old = intel_ntb_link_enabled(ntb->device); error = SYSCTL_OUT(req, &old, sizeof(old)); if (error != 0 || req->newptr == NULL) @@ -2640,31 +2489,28 @@ sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS) if (error != 0) return (error); - ntb_printf(0, "Admin set interface state to '%sabled'\n", + intel_ntb_printf(0, "Admin set interface state to '%sabled'\n", (new != 0)? "en" : "dis"); if (new != 0) - error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + error = intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); else - error = ntb_link_disable(ntb); + error = intel_ntb_link_disable(ntb->device); return (error); } static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS) { - struct ntb_softc *ntb; + struct ntb_softc *ntb = arg1; struct sbuf sb; enum ntb_speed speed; enum ntb_width width; int error; - error = 0; - ntb = arg1; - sbuf_new_for_sysctl(&sb, NULL, 32, req); - if (ntb_link_is_up(ntb, &speed, &width)) + if (intel_ntb_link_is_up(ntb->device, &speed, &width)) sbuf_printf(&sb, "up / PCIe Gen %u / Width x%u", (unsigned)speed, (unsigned)width); else @@ -2681,14 +2527,11 @@ sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS) static int sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) { - struct ntb_softc *ntb; + struct ntb_softc *ntb = arg1; unsigned res; int error; - error = 0; - ntb = arg1; - - res = ntb_link_is_up(ntb, NULL, NULL); + res = intel_ntb_link_is_up(ntb->device, NULL, NULL); error = SYSCTL_OUT(req, &res, sizeof(res)); if (error || !req->newptr) @@ -2727,28 +2570,28 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS) if (pci) umv = pci_read_config(ntb->device, reg, 8); else - umv = ntb_reg_read(8, reg); + umv = intel_ntb_reg_read(8, reg); outsz = sizeof(uint64_t); break; case NTB_REG_32: if (pci) umv = pci_read_config(ntb->device, reg, 4); else - umv = ntb_reg_read(4, reg); + umv = intel_ntb_reg_read(4, reg); outsz = sizeof(uint32_t); break; case NTB_REG_16: if (pci) umv = pci_read_config(ntb->device, reg, 2); else - umv = ntb_reg_read(2, reg); + umv = intel_ntb_reg_read(2, reg); outsz = sizeof(uint16_t); break; case NTB_REG_8: if (pci) umv = pci_read_config(ntb->device, reg, 1); else - umv = ntb_reg_read(1, reg); + umv = intel_ntb_reg_read(1, reg); outsz = sizeof(uint8_t); break; default: @@ -2768,7 +2611,7 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS) } static unsigned -ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx) +intel_ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx) { if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && @@ -2782,8 +2625,21 @@ ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx) return (uidx); } +#ifndef EARLY_AP_STARTUP +static int msix_ready; + +static void +intel_ntb_msix_ready(void *arg __unused) +{ + + msix_ready = 1; +} +SYSINIT(intel_ntb_msix_ready, 
SI_SUB_SMP, SI_ORDER_ANY, + intel_ntb_msix_ready, NULL); +#endif + static void -ntb_exchange_msix(void *ctx) +intel_ntb_exchange_msix(void *ctx) { struct ntb_softc *ntb; uint32_t val; @@ -2796,42 +2652,50 @@ ntb_exchange_msix(void *ctx) if (ntb->peer_msix_done) goto msix_done; +#ifndef EARLY_AP_STARTUP + /* Block MSIX negotiation until SMP started and IRQ reshuffled. */ + if (!msix_ready) + goto reschedule; +#endif + + intel_ntb_get_msix_info(ntb); for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { - ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i, + intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DATA0 + i, ntb->msix_data[i].nmd_data); - ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i, + intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_OFS0 + i, ntb->msix_data[i].nmd_ofs - ntb->msix_xlat); } - ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD); + intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD); - ntb_spad_read(ntb, NTB_MSIX_GUARD, &val); + intel_ntb_spad_read(ntb->device, NTB_MSIX_GUARD, &val); if (val != NTB_MSIX_VER_GUARD) goto reschedule; for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { - ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val); - ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val); + intel_ntb_spad_read(ntb->device, NTB_MSIX_DATA0 + i, &val); + intel_ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val); ntb->peer_msix_data[i].nmd_data = val; - ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val); - ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val); + intel_ntb_spad_read(ntb->device, NTB_MSIX_OFS0 + i, &val); + intel_ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val); ntb->peer_msix_data[i].nmd_ofs = val; } ntb->peer_msix_done = true; msix_done: - ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED); - ntb_spad_read(ntb, NTB_MSIX_DONE, &val); + intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DONE, NTB_MSIX_RECEIVED); + intel_ntb_spad_read(ntb->device, NTB_MSIX_DONE, &val); if (val != NTB_MSIX_RECEIVED) goto reschedule; + intel_ntb_spad_clear(ntb->device); ntb->peer_msix_good = true; /* Give peer time to see our NTB_MSIX_RECEIVED. */ goto reschedule; msix_good: - ntb_poll_link(ntb); - ntb_link_event(ntb); + intel_ntb_poll_link(ntb); + ntb_link_event(ntb->device); return; reschedule: @@ -2839,40 +2703,27 @@ reschedule: if (_xeon_link_is_up(ntb)) { callout_reset(&ntb->peer_msix_work, hz * (ntb->peer_msix_good ? 2 : 1) / 100, - ntb_exchange_msix, ntb); + intel_ntb_exchange_msix, ntb); } else - ntb_spad_clear(ntb); + intel_ntb_spad_clear(ntb->device); } /* * Public API to the rest of the OS */ -/** - * ntb_get_max_spads() - get the total scratch regs usable - * @ntb: pointer to ntb_softc instance - * - * This function returns the max 32bit scratchpad registers usable by the - * upper layer. - * - * RETURNS: total number of scratch pad registers available - */ -uint8_t -ntb_get_max_spads(struct ntb_softc *ntb) +static uint8_t +intel_ntb_spad_count(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); return (ntb->spad_count); } -/* - * ntb_mw_count() - Get the number of memory windows available for KPI - * consumers. - * - * (Excludes any MW wholly reserved for register access.) 
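These removed kernel-doc blocks describe functions that survive under new names: throughout this diff, each exported ntb_*(struct ntb_softc *) KPI call becomes a static intel_ntb_*(device_t) method that recovers the softc with device_get_softc(), since consumers now hold only a device_t. A minimal sketch of that recurring shape; the member name is invented for illustration:

	static uint8_t
	intel_ntb_example_count(device_t dev)
	{
		struct ntb_softc *ntb = device_get_softc(dev);

		/* 'example_count' is a made-up field; the real methods
		 * return spad_count, mw_count, etc., as in the hunks
		 * nearby. */
		return (ntb->example_count);
	}
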
- */ -uint8_t -ntb_mw_count(struct ntb_softc *ntb) +static uint8_t +intel_ntb_mw_count(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); uint8_t res; res = ntb->mw_count; @@ -2883,25 +2734,15 @@ ntb_mw_count(struct ntb_softc *ntb) return (res); } -/** - * ntb_spad_write() - write to the secondary scratchpad register - * @ntb: pointer to ntb_softc instance - * @idx: index to the scratchpad register, 0 based - * @val: the data value to put into the register - * - * This function allows writing of a 32bit value to the indexed scratchpad - * register. The register resides on the secondary (external) side. - * - * RETURNS: An appropriate ERRNO error value on error, or zero for success. - */ -int -ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val) +static int +intel_ntb_spad_write(device_t dev, unsigned int idx, uint32_t val) { + struct ntb_softc *ntb = device_get_softc(dev); if (idx >= ntb->spad_count) return (EINVAL); - ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val); + intel_ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val); return (0); } @@ -2909,122 +2750,77 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val) /* * Zeros the local scratchpad. */ -void -ntb_spad_clear(struct ntb_softc *ntb) +static void +intel_ntb_spad_clear(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); unsigned i; for (i = 0; i < ntb->spad_count; i++) - ntb_spad_write(ntb, i, 0); + intel_ntb_spad_write(dev, i, 0); } -/** - * ntb_spad_read() - read from the primary scratchpad register - * @ntb: pointer to ntb_softc instance - * @idx: index to scratchpad register, 0 based - * @val: pointer to 32bit integer for storing the register value - * - * This function allows reading of the 32bit scratchpad register on - * the primary (internal) side. - * - * RETURNS: An appropriate ERRNO error value on error, or zero for success. - */ -int -ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val) +static int +intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val) { + struct ntb_softc *ntb = device_get_softc(dev); if (idx >= ntb->spad_count) return (EINVAL); - *val = ntb_reg_read(4, ntb->self_reg->spad + idx * 4); + *val = intel_ntb_reg_read(4, ntb->self_reg->spad + idx * 4); return (0); } -/** - * ntb_peer_spad_write() - write to the secondary scratchpad register - * @ntb: pointer to ntb_softc instance - * @idx: index to the scratchpad register, 0 based - * @val: the data value to put into the register - * - * This function allows writing of a 32bit value to the indexed scratchpad - * register. The register resides on the secondary (external) side. - * - * RETURNS: An appropriate ERRNO error value on error, or zero for success. 
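The ERRNO contract survives the conversion: each scratchpad accessor returns EINVAL for an out-of-range index and zero on success. A consumer-side sketch of the handshake idiom, mirroring the version exchange in ntb_transport_link_work() further down (NTBT_VERSION is 0 and NTB_TRANSPORT_VERSION is 4 in the transport code below):

	uint32_t val;
	int error;

	/* Publish a value in the register the remote side reads... */
	error = ntb_peer_spad_write(dev, NTBT_VERSION, NTB_TRANSPORT_VERSION);
	/* ...then fetch whatever the remote side published to us. */
	if (error == 0)
		error = ntb_spad_read(dev, NTBT_VERSION, &val);
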
- */ -int -ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val) +static int +intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val) { + struct ntb_softc *ntb = device_get_softc(dev); if (idx >= ntb->spad_count) return (EINVAL); - if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) - ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val); + if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) + intel_ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val); else - ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val); + intel_ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val); return (0); } -/** - * ntb_peer_spad_read() - read from the primary scratchpad register - * @ntb: pointer to ntb_softc instance - * @idx: index to scratchpad register, 0 based - * @val: pointer to 32bit integer for storing the register value - * - * This function allows reading of the 32bit scratchpad register on - * the primary (internal) side. - * - * RETURNS: An appropriate ERRNO error value on error, or zero for success. - */ -int -ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val) +static int +intel_ntb_peer_spad_read(device_t dev, unsigned int idx, uint32_t *val) { + struct ntb_softc *ntb = device_get_softc(dev); if (idx >= ntb->spad_count) return (EINVAL); - if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) - *val = ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4); + if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) + *val = intel_ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4); else - *val = ntb_reg_read(4, ntb->peer_reg->spad + idx * 4); + *val = intel_ntb_reg_read(4, ntb->peer_reg->spad + idx * 4); return (0); } -/* - * ntb_mw_get_range() - get the range of a memory window - * @ntb: NTB device context - * @idx: Memory window number - * @base: OUT - the base address for mapping the memory window - * @size: OUT - the size for mapping the memory window - * @align: OUT - the base alignment for translating the memory window - * @align_size: OUT - the size alignment for translating the memory window - * - * Get the range of a memory window. NULL may be given for any output - * parameter if the value is not needed. The base and size may be used for - * mapping the memory window, to access the peer memory. The alignment and - * size may be used for translating the memory window, for the peer to access - * memory on the local system. - * - * Return: Zero on success, otherwise an error number. 
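The memory-window geometry call converted in the next hunk reports two distinct things per window: the local mapping (base, vbase, size), used to write into peer memory, and the translation constraints (align, align_size, plimit), which bound what ntb_mw_set_trans() will accept. ntb_transport_attach() below consumes all of them at once; a pared-down sketch of the same call:

	vm_paddr_t base;
	caddr_t vbase;
	size_t size, align, align_size;
	bus_addr_t plimit;

	if (ntb_mw_get_range(dev, 0, &base, &vbase, &size,
	    &align, &align_size, &plimit) == 0) {
		/* vbase..vbase+size maps the peer side; any address
		 * later passed to ntb_mw_set_trans() must honor
		 * align/align_size and stay below plimit. */
	}
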
- */ -int -ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base, +static int +intel_ntb_mw_get_range(device_t dev, unsigned mw_idx, vm_paddr_t *base, caddr_t *vbase, size_t *size, size_t *align, size_t *align_size, bus_addr_t *plimit) { + struct ntb_softc *ntb = device_get_softc(dev); struct ntb_pci_bar_info *bar; bus_addr_t limit; size_t bar_b2b_off; enum ntb_bar bar_num; - if (mw_idx >= ntb_mw_count(ntb)) + if (mw_idx >= intel_ntb_mw_count(dev)) return (EINVAL); - mw_idx = ntb_user_mw_to_idx(ntb, mw_idx); + mw_idx = intel_ntb_user_mw_to_idx(ntb, mw_idx); - bar_num = ntb_mw_to_bar(ntb, mw_idx); + bar_num = intel_ntb_mw_to_bar(ntb, mw_idx); bar = &ntb->bar_info[bar_num]; bar_b2b_off = 0; if (mw_idx == ntb->b2b_mw_idx) { @@ -3053,37 +2849,21 @@ ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base, return (0); } -/* - * ntb_mw_set_trans() - set the translation of a memory window - * @ntb: NTB device context - * @idx: Memory window number - * @addr: The dma address local memory to expose to the peer - * @size: The size of the local memory to expose to the peer - * - * Set the translation of a memory window. The peer may access local memory - * through the window starting at the address, up to the size. The address - * must be aligned to the alignment specified by ntb_mw_get_range(). The size - * must be aligned to the size alignment specified by ntb_mw_get_range(). The - * address must be below the plimit specified by ntb_mw_get_range() (i.e. for - * 32-bit BARs). - * - * Return: Zero on success, otherwise an error number. - */ -int -ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr, - size_t size) +static int +intel_ntb_mw_set_trans(device_t dev, unsigned idx, bus_addr_t addr, size_t size) { + struct ntb_softc *ntb = device_get_softc(dev); struct ntb_pci_bar_info *bar; uint64_t base, limit, reg_val; size_t bar_size, mw_size; uint32_t base_reg, xlat_reg, limit_reg; enum ntb_bar bar_num; - if (idx >= ntb_mw_count(ntb)) + if (idx >= intel_ntb_mw_count(dev)) return (EINVAL); - idx = ntb_user_mw_to_idx(ntb, idx); + idx = intel_ntb_user_mw_to_idx(ntb, idx); - bar_num = ntb_mw_to_bar(ntb, idx); + bar_num = intel_ntb_mw_to_bar(ntb, idx); bar = &ntb->bar_info[bar_num]; bar_size = bar->size; @@ -3103,25 +2883,25 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr, limit = 0; if (bar_is_64bit(ntb, bar_num)) { - base = ntb_reg_read(8, base_reg) & BAR_HIGH_MASK; + base = intel_ntb_reg_read(8, base_reg) & BAR_HIGH_MASK; if (limit_reg != 0 && size != mw_size) limit = base + size; /* Set and verify translation address */ - ntb_reg_write(8, xlat_reg, addr); - reg_val = ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK; + intel_ntb_reg_write(8, xlat_reg, addr); + reg_val = intel_ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK; if (reg_val != addr) { - ntb_reg_write(8, xlat_reg, 0); + intel_ntb_reg_write(8, xlat_reg, 0); return (EIO); } /* Set and verify the limit */ - ntb_reg_write(8, limit_reg, limit); - reg_val = ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK; + intel_ntb_reg_write(8, limit_reg, limit); + reg_val = intel_ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK; if (reg_val != limit) { - ntb_reg_write(8, limit_reg, base); - ntb_reg_write(8, xlat_reg, 0); + intel_ntb_reg_write(8, limit_reg, base); + intel_ntb_reg_write(8, xlat_reg, 0); return (EIO); } } else { @@ -3132,98 +2912,72 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr, if (((addr + size) & UINT32_MAX) != (addr + size)) return (ERANGE); - base = 
ntb_reg_read(4, base_reg) & BAR_HIGH_MASK; + base = intel_ntb_reg_read(4, base_reg) & BAR_HIGH_MASK; if (limit_reg != 0 && size != mw_size) limit = base + size; /* Set and verify translation address */ - ntb_reg_write(4, xlat_reg, addr); - reg_val = ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK; + intel_ntb_reg_write(4, xlat_reg, addr); + reg_val = intel_ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK; if (reg_val != addr) { - ntb_reg_write(4, xlat_reg, 0); + intel_ntb_reg_write(4, xlat_reg, 0); return (EIO); } /* Set and verify the limit */ - ntb_reg_write(4, limit_reg, limit); - reg_val = ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK; + intel_ntb_reg_write(4, limit_reg, limit); + reg_val = intel_ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK; if (reg_val != limit) { - ntb_reg_write(4, limit_reg, base); - ntb_reg_write(4, xlat_reg, 0); + intel_ntb_reg_write(4, limit_reg, base); + intel_ntb_reg_write(4, xlat_reg, 0); return (EIO); } } return (0); } -/* - * ntb_mw_clear_trans() - clear the translation of a memory window - * @ntb: NTB device context - * @idx: Memory window number - * - * Clear the translation of a memory window. The peer may no longer access - * local memory through the window. - * - * Return: Zero on success, otherwise an error number. - */ -int -ntb_mw_clear_trans(struct ntb_softc *ntb, unsigned mw_idx) +static int +intel_ntb_mw_clear_trans(device_t dev, unsigned mw_idx) { - return (ntb_mw_set_trans(ntb, mw_idx, 0, 0)); + return (intel_ntb_mw_set_trans(dev, mw_idx, 0, 0)); } -/* - * ntb_mw_get_wc - Get the write-combine status of a memory window - * - * Returns: Zero on success, setting *wc; otherwise an error number (e.g. if - * idx is an invalid memory window). - * - * Mode is a VM_MEMATTR_* type. - */ -int -ntb_mw_get_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t *mode) +static int +intel_ntb_mw_get_wc(device_t dev, unsigned idx, vm_memattr_t *mode) { + struct ntb_softc *ntb = device_get_softc(dev); struct ntb_pci_bar_info *bar; - if (idx >= ntb_mw_count(ntb)) + if (idx >= intel_ntb_mw_count(dev)) return (EINVAL); - idx = ntb_user_mw_to_idx(ntb, idx); + idx = intel_ntb_user_mw_to_idx(ntb, idx); - bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)]; + bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)]; *mode = bar->map_mode; return (0); } -/* - * ntb_mw_set_wc - Set the write-combine status of a memory window - * - * If 'mode' matches the current status, this does nothing and succeeds. Mode - * is a VM_MEMATTR_* type. - * - * Returns: Zero on success, setting the caching attribute on the virtual - * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid - * memory window, or if changing the caching attribute fails). 
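The mw_set_wc pair converted just below exists mainly for throughput: with an uncacheable mapping every store to the window becomes its own PCIe transaction, while write-combining lets the CPU burst full write buffers across the link. ntb_transport_attach() below requests it for every window; in minimal form:

	if (ntb_mw_set_wc(dev, 0, VM_MEMATTR_WRITE_COMBINING) != 0)
		device_printf(dev, "mw0 left uncached; expect lower throughput\n");
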
- */ -int -ntb_mw_set_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode) +static int +intel_ntb_mw_set_wc(device_t dev, unsigned idx, vm_memattr_t mode) { + struct ntb_softc *ntb = device_get_softc(dev); - if (idx >= ntb_mw_count(ntb)) + if (idx >= intel_ntb_mw_count(dev)) return (EINVAL); - idx = ntb_user_mw_to_idx(ntb, idx); - return (ntb_mw_set_wc_internal(ntb, idx, mode)); + idx = intel_ntb_user_mw_to_idx(ntb, idx); + return (intel_ntb_mw_set_wc_internal(ntb, idx, mode)); } static int -ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode) +intel_ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode) { struct ntb_pci_bar_info *bar; int rc; - bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)]; + bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)]; if (bar->map_mode == mode) return (0); @@ -3234,26 +2988,19 @@ ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode) return (rc); } -/** - * ntb_peer_db_set() - Set the doorbell on the secondary/external side - * @ntb: pointer to ntb_softc instance - * @bit: doorbell bits to ring - * - * This function allows triggering of a doorbell on the secondary/external - * side that will initiate an interrupt on the remote host - */ -void -ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit) +static void +intel_ntb_peer_db_set(device_t dev, uint64_t bit) { + struct ntb_softc *ntb = device_get_softc(dev); - if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) { struct ntb_pci_bar_info *lapic; unsigned i; lapic = ntb->peer_lapic_bar; for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { - if ((bit & ntb_db_vector_mask(ntb, i)) != 0) + if ((bit & intel_ntb_db_vector_mask(dev, i)) != 0) bus_space_write_4(lapic->pci_bus_tag, lapic->pci_bus_handle, ntb->peer_msix_data[i].nmd_ofs, @@ -3262,99 +3009,76 @@ ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit) return; } - if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { - ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit); + if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) { + intel_ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit); return; } db_iowrite(ntb, ntb->peer_reg->db_bell, bit); } -/* - * ntb_get_peer_db_addr() - Return the address of the remote doorbell register, - * as well as the size of the register (via *sz_out). - * - * This function allows a caller using I/OAT DMA to chain the remote doorbell - * ring to its memory window write. - * - * Note that writing the peer doorbell via a memory window will *not* generate - * an interrupt on the remote host; that must be done seperately. - */ -bus_addr_t -ntb_get_peer_db_addr(struct ntb_softc *ntb, vm_size_t *sz_out) +static int +intel_ntb_peer_db_addr(device_t dev, bus_addr_t *db_addr, vm_size_t *db_size) { + struct ntb_softc *ntb = device_get_softc(dev); struct ntb_pci_bar_info *bar; uint64_t regoff; - KASSERT(sz_out != NULL, ("must be non-NULL")); + KASSERT((db_addr != NULL && db_size != NULL), ("must be non-NULL")); - if (!HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + if (!HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) { bar = &ntb->bar_info[NTB_CONFIG_BAR]; regoff = ntb->peer_reg->db_bell; } else { KASSERT(ntb->b2b_mw_idx != B2B_MW_DISABLED, ("invalid b2b idx")); - bar = &ntb->bar_info[ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)]; + bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)]; regoff = XEON_PDOORBELL_OFFSET; } KASSERT(bar->pci_bus_tag != X86_BUS_SPACE_IO, ("uh oh")); - *sz_out = ntb->reg->db_size; /* HACK: Specific to current x86 bus implementation. 
*/ - return ((uint64_t)bar->pci_bus_handle + regoff); + *db_addr = ((uint64_t)bar->pci_bus_handle + regoff); + *db_size = ntb->reg->db_size; + return (0); } -/* - * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb - * @ntb: NTB device context - * - * Hardware may support different number or arrangement of doorbell bits. - * - * Return: A mask of doorbell bits supported by the ntb. - */ -uint64_t -ntb_db_valid_mask(struct ntb_softc *ntb) +static uint64_t +intel_ntb_db_valid_mask(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); return (ntb->db_valid_mask); } -/* - * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector - * @ntb: NTB device context - * @vector: Doorbell vector number - * - * Each interrupt vector may have a different number or arrangement of bits. - * - * Return: A mask of doorbell bits serviced by a vector. - */ -uint64_t -ntb_db_vector_mask(struct ntb_softc *ntb, uint32_t vector) +static int +intel_ntb_db_vector_count(device_t dev) { + struct ntb_softc *ntb = device_get_softc(dev); + + return (ntb->db_vec_count); +} + +static uint64_t +intel_ntb_db_vector_mask(device_t dev, uint32_t vector) +{ + struct ntb_softc *ntb = device_get_softc(dev); if (vector > ntb->db_vec_count) return (0); - return (ntb->db_valid_mask & ntb_vec_mask(ntb, vector)); + return (ntb->db_valid_mask & intel_ntb_vec_mask(ntb, vector)); } -/** - * ntb_link_is_up() - get the current ntb link state - * @ntb: NTB device context - * @speed: OUT - The link speed expressed as PCIe generation number - * @width: OUT - The link width expressed as the number of PCIe lanes - * - * RETURNS: true or false based on the hardware link state - */ -bool -ntb_link_is_up(struct ntb_softc *ntb, enum ntb_speed *speed, - enum ntb_width *width) +static bool +intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed, enum ntb_width *width) { + struct ntb_softc *ntb = device_get_softc(dev); if (speed != NULL) - *speed = ntb_link_sta_speed(ntb); + *speed = intel_ntb_link_sta_speed(ntb); if (width != NULL) - *width = ntb_link_sta_width(ntb); + *width = intel_ntb_link_sta_width(ntb); return (link_is_up(ntb)); } @@ -3369,17 +3093,42 @@ save_bar_parameters(struct ntb_pci_bar_info *bar) bar->vbase = rman_get_virtual(bar->pci_resource); } -device_t -ntb_get_device(struct ntb_softc *ntb) -{ - - return (ntb->device); -} - -/* Export HW-specific errata information. 
*/ -bool -ntb_has_feature(struct ntb_softc *ntb, uint32_t feature) -{ +static device_method_t ntb_intel_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, intel_ntb_probe), + DEVMETHOD(device_attach, intel_ntb_attach), + DEVMETHOD(device_detach, intel_ntb_detach), + /* NTB interface */ + DEVMETHOD(ntb_link_is_up, intel_ntb_link_is_up), + DEVMETHOD(ntb_link_enable, intel_ntb_link_enable), + DEVMETHOD(ntb_link_disable, intel_ntb_link_disable), + DEVMETHOD(ntb_link_enabled, intel_ntb_link_enabled), + DEVMETHOD(ntb_mw_count, intel_ntb_mw_count), + DEVMETHOD(ntb_mw_get_range, intel_ntb_mw_get_range), + DEVMETHOD(ntb_mw_set_trans, intel_ntb_mw_set_trans), + DEVMETHOD(ntb_mw_clear_trans, intel_ntb_mw_clear_trans), + DEVMETHOD(ntb_mw_get_wc, intel_ntb_mw_get_wc), + DEVMETHOD(ntb_mw_set_wc, intel_ntb_mw_set_wc), + DEVMETHOD(ntb_spad_count, intel_ntb_spad_count), + DEVMETHOD(ntb_spad_clear, intel_ntb_spad_clear), + DEVMETHOD(ntb_spad_write, intel_ntb_spad_write), + DEVMETHOD(ntb_spad_read, intel_ntb_spad_read), + DEVMETHOD(ntb_peer_spad_write, intel_ntb_peer_spad_write), + DEVMETHOD(ntb_peer_spad_read, intel_ntb_peer_spad_read), + DEVMETHOD(ntb_db_valid_mask, intel_ntb_db_valid_mask), + DEVMETHOD(ntb_db_vector_count, intel_ntb_db_vector_count), + DEVMETHOD(ntb_db_vector_mask, intel_ntb_db_vector_mask), + DEVMETHOD(ntb_db_clear, intel_ntb_db_clear), + DEVMETHOD(ntb_db_clear_mask, intel_ntb_db_clear_mask), + DEVMETHOD(ntb_db_read, intel_ntb_db_read), + DEVMETHOD(ntb_db_set_mask, intel_ntb_db_set_mask), + DEVMETHOD(ntb_peer_db_addr, intel_ntb_peer_db_addr), + DEVMETHOD(ntb_peer_db_set, intel_ntb_peer_db_set), + DEVMETHOD_END +}; - return (HAS_FEATURE(feature)); -} +static DEFINE_CLASS_0(ntb_hw, ntb_intel_driver, ntb_intel_methods, + sizeof(struct ntb_softc)); +DRIVER_MODULE(ntb_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL); +MODULE_DEPEND(ntb_intel, ntb, 1, 1, 1); +MODULE_VERSION(ntb_intel, 1); diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h deleted file mode 100644 index f05acda..0000000 --- a/sys/dev/ntb/ntb_hw/ntb_hw.h +++ /dev/null @@ -1,125 +0,0 @@ -/*- - * Copyright (C) 2013 Intel Corporation - * Copyright (C) 2015 EMC Corporation - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
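With ntb_hw.h deleted, backends no longer export softc-based prototypes; they implement the ntb_if.m interface (added further down) and register a method table like ntb_intel_methods above. A hypothetical second hardware backend would take the same shape, leaving room for non-Intel parts as siblings of ntb_intel without touching consumers; every foo_* name here is invented for illustration:

	static device_method_t ntb_foo_methods[] = {
		/* Device interface */
		DEVMETHOD(device_probe,		foo_ntb_probe),
		DEVMETHOD(device_attach,	foo_ntb_attach),
		DEVMETHOD(device_detach,	foo_ntb_detach),
		/* NTB interface: one entry per ntb_if.m method */
		DEVMETHOD(ntb_mw_count,		foo_ntb_mw_count),
		DEVMETHOD(ntb_spad_count,	foo_ntb_spad_count),
		/* ...remaining ntb_* methods elided... */
		DEVMETHOD_END
	};
	static DEFINE_CLASS_0(ntb_hw, ntb_foo_driver, ntb_foo_methods,
	    sizeof(struct ntb_foo_softc));
	DRIVER_MODULE(ntb_foo, pci, ntb_foo_driver, ntb_hw_devclass, NULL, NULL);
	MODULE_DEPEND(ntb_foo, ntb, 1, 1, 1);
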
- * - * $FreeBSD$ - */ - -#ifndef _NTB_HW_H_ -#define _NTB_HW_H_ - -struct ntb_softc; - -#define NTB_MAX_NUM_MW 3 - -enum ntb_speed { - NTB_SPEED_AUTO = -1, - NTB_SPEED_NONE = 0, - NTB_SPEED_GEN1 = 1, - NTB_SPEED_GEN2 = 2, - NTB_SPEED_GEN3 = 3, -}; - -enum ntb_width { - NTB_WIDTH_AUTO = -1, - NTB_WIDTH_NONE = 0, - NTB_WIDTH_1 = 1, - NTB_WIDTH_2 = 2, - NTB_WIDTH_4 = 4, - NTB_WIDTH_8 = 8, - NTB_WIDTH_12 = 12, - NTB_WIDTH_16 = 16, - NTB_WIDTH_32 = 32, -}; - -SYSCTL_DECL(_hw_ntb); - -typedef void (*ntb_db_callback)(void *data, uint32_t vector); -typedef void (*ntb_event_callback)(void *data); - -struct ntb_ctx_ops { - ntb_event_callback link_event; - ntb_db_callback db_event; -}; - -device_t ntb_get_device(struct ntb_softc *); - -bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *); -void ntb_link_event(struct ntb_softc *); -int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width); -int ntb_link_disable(struct ntb_softc *); -bool ntb_link_enabled(struct ntb_softc *); - -int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *); -void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **); -void ntb_clear_ctx(struct ntb_softc *); - -uint8_t ntb_mw_count(struct ntb_softc *); -int ntb_mw_get_range(struct ntb_softc *, unsigned mw_idx, vm_paddr_t *base, - caddr_t *vbase, size_t *size, size_t *align, size_t *align_size, - bus_addr_t *plimit); -int ntb_mw_set_trans(struct ntb_softc *, unsigned mw_idx, bus_addr_t, size_t); -int ntb_mw_clear_trans(struct ntb_softc *, unsigned mw_idx); - -int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode); -int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode); - -uint8_t ntb_get_max_spads(struct ntb_softc *ntb); -void ntb_spad_clear(struct ntb_softc *ntb); -int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val); -int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val); -int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, - uint32_t val); -int ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx, - uint32_t *val); - -uint64_t ntb_db_valid_mask(struct ntb_softc *); -uint64_t ntb_db_vector_mask(struct ntb_softc *, uint32_t vector); -bus_addr_t ntb_get_peer_db_addr(struct ntb_softc *, vm_size_t *sz_out); - -void ntb_db_clear(struct ntb_softc *, uint64_t bits); -void ntb_db_clear_mask(struct ntb_softc *, uint64_t bits); -uint64_t ntb_db_read(struct ntb_softc *); -void ntb_db_set_mask(struct ntb_softc *, uint64_t bits); -void ntb_peer_db_set(struct ntb_softc *, uint64_t bits); - -#define XEON_SPAD_COUNT 16 -#define ATOM_SPAD_COUNT 16 - -/* Hardware owns the low 16 bits of features. */ -#define NTB_BAR_SIZE_4K (1 << 0) -#define NTB_SDOORBELL_LOCKUP (1 << 1) -#define NTB_SB01BASE_LOCKUP (1 << 2) -#define NTB_B2BDOORBELL_BIT14 (1 << 3) -/* Software/configuration owns the top 16 bits. */ -#define NTB_SPLIT_BAR (1ull << 16) - -#define NTB_FEATURES_STR \ - "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \ - "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K" - -bool ntb_has_feature(struct ntb_softc *, uint32_t); - -#endif /* _NTB_HW_H_ */ diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h index fb445d7..a037736 100644 --- a/sys/dev/ntb/ntb_hw/ntb_regs.h +++ b/sys/dev/ntb/ntb_hw/ntb_regs.h @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> * Copyright (C) 2013 Intel Corporation * Copyright (C) 2015 EMC Corporation * All rights reserved. 
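Worth decoding before it disappears: NTB_FEATURES_STR above is the "%b" descriptor that sysctl_handle_features() prints with. The leading \20 (decimal 16) selects hex output; each following octal byte is a 1-origin bit position introducing a flag name, so \01 names bit 1 (1 << 0, BAR_SIZE_4K) and \21 (decimal 17) names bit 17 (1ull << 16, SPLIT_BAR). A worked example with an assumed feature mask:

	/*
	 * features = NTB_SPLIT_BAR | NTB_SDOORBELL_LOCKUP
	 *          = (1ull << 16) | (1 << 1) = 0x10002
	 * sbuf_printf(&sb, "%b", features, NTB_FEATURES_STR) renders
	 * roughly: 10002<SPLIT_BAR4,SDOORBELL_LOCKUP>
	 */
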
@@ -76,6 +77,7 @@ #define XEON_SDBMSK_OFFSET 0x0066 #define XEON_USMEMMISS_OFFSET 0x0070 #define XEON_SPAD_OFFSET 0x0080 +#define XEON_SPAD_COUNT 16 #define XEON_SPADSEMA4_OFFSET 0x00c0 #define XEON_WCCNTRL_OFFSET 0x00e0 #define XEON_UNCERRSTS_OFFSET 0x014c @@ -104,6 +106,7 @@ #define ATOM_NTBCNTL_OFFSET 0x0060 #define ATOM_EBDF_OFFSET 0x0064 #define ATOM_SPAD_OFFSET 0x0080 +#define ATOM_SPAD_COUNT 16 #define ATOM_SPADSEMA_OFFSET 0x00c0 #define ATOM_STKYSPAD_OFFSET 0x00c4 #define ATOM_PBAR2XLAT_OFFSET 0x8008 diff --git a/sys/dev/ntb/ntb_if.m b/sys/dev/ntb/ntb_if.m new file mode 100644 index 0000000..d8ca227 --- /dev/null +++ b/sys/dev/ntb/ntb_if.m @@ -0,0 +1,210 @@ +#- +# Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# $FreeBSD$ +# + +#include <sys/bus.h> +#include <machine/bus.h> + +INTERFACE ntb; + +HEADER { + enum ntb_speed { + NTB_SPEED_AUTO = -1, + NTB_SPEED_NONE = 0, + NTB_SPEED_GEN1 = 1, + NTB_SPEED_GEN2 = 2, + NTB_SPEED_GEN3 = 3, + }; + + enum ntb_width { + NTB_WIDTH_AUTO = -1, + NTB_WIDTH_NONE = 0, + NTB_WIDTH_1 = 1, + NTB_WIDTH_2 = 2, + NTB_WIDTH_4 = 4, + NTB_WIDTH_8 = 8, + NTB_WIDTH_12 = 12, + NTB_WIDTH_16 = 16, + NTB_WIDTH_32 = 32, + }; + + typedef void (*ntb_db_callback)(void *data, uint32_t vector); + typedef void (*ntb_event_callback)(void *data); + struct ntb_ctx_ops { + ntb_event_callback link_event; + ntb_db_callback db_event; + }; +}; + +METHOD bool link_is_up { + device_t ntb; + enum ntb_speed *speed; + enum ntb_width *width; +}; + +METHOD int link_enable { + device_t ntb; + enum ntb_speed speed; + enum ntb_width width; +}; + +METHOD int link_disable { + device_t ntb; +}; + +METHOD bool link_enabled { + device_t ntb; +}; + +METHOD int set_ctx { + device_t ntb; + void *ctx; + const struct ntb_ctx_ops *ctx_ops; +}; + +METHOD void * get_ctx { + device_t ntb; + const struct ntb_ctx_ops **ctx_ops; +}; + +METHOD void clear_ctx { + device_t ntb; +}; + +METHOD uint8_t mw_count { + device_t ntb; +}; + +METHOD int mw_get_range { + device_t ntb; + unsigned mw_idx; + vm_paddr_t *base; + caddr_t *vbase; + size_t *size; + size_t *align; + size_t *align_size; + bus_addr_t *plimit; +}; + +METHOD int mw_set_trans { + device_t ntb; + unsigned mw_idx; + bus_addr_t addr; + size_t size; +}; + +METHOD int mw_clear_trans { + device_t ntb; + unsigned mw_idx; +}; + +METHOD int mw_get_wc { + device_t ntb; + unsigned mw_idx; + vm_memattr_t *mode; +}; + +METHOD int mw_set_wc { + device_t ntb; + unsigned mw_idx; + vm_memattr_t mode; +}; + +METHOD uint8_t spad_count { + device_t ntb; +}; + +METHOD void spad_clear { + device_t ntb; +}; + +METHOD int spad_write { + device_t ntb; + unsigned int idx; + uint32_t val; +}; + +METHOD int spad_read { + device_t ntb; + unsigned int idx; + uint32_t *val; +}; + +METHOD int peer_spad_write { + device_t ntb; + unsigned int idx; + uint32_t val; +}; + +METHOD int peer_spad_read { + device_t ntb; + unsigned int idx; + uint32_t *val; +}; + +METHOD uint64_t db_valid_mask { + device_t ntb; +}; + +METHOD int db_vector_count { + device_t ntb; +}; + +METHOD uint64_t db_vector_mask { + device_t ntb; + uint32_t vector; +}; + +METHOD int peer_db_addr { + device_t ntb; + bus_addr_t *db_addr; + vm_size_t *db_size; +}; + +METHOD void db_clear { + device_t ntb; + uint64_t bits; +}; + +METHOD void db_clear_mask { + device_t ntb; + uint64_t bits; +}; + +METHOD uint64_t db_read { + device_t ntb; +}; + +METHOD void db_set_mask { + device_t ntb; + uint64_t bits; +}; + +METHOD void peer_db_set { + device_t ntb; + uint64_t bits; +}; diff --git a/sys/dev/ntb/ntb_transport.c b/sys/dev/ntb/ntb_transport.c new file mode 100644 index 0000000..5297db9 --- /dev/null +++ b/sys/dev/ntb/ntb_transport.c @@ -0,0 +1,1521 @@ +/*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2015 EMC Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The Non-Transparent Bridge (NTB) is a device that allows you to connect + * two or more systems using a PCI-e links, providing remote memory access. + * + * This module contains a transport for sending and receiving messages by + * writing to remote memory window(s) provided by underlying NTB device. + * + * NOTE: Much of the code in this module is shared with Linux. Any patches may + * be picked up and redistributed in Linux with a dual GPL/BSD license. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/bus.h> + +#include "ntb.h" +#include "ntb_transport.h" + +#define KTR_NTB KTR_SPARE3 + +#define NTB_TRANSPORT_VERSION 4 + +static SYSCTL_NODE(_hw, OID_AUTO, ntb_transport, CTLFLAG_RW, 0, "ntb_transport"); + +static unsigned g_ntb_transport_debug_level; +TUNABLE_INT("hw.ntb_transport.debug_level", &g_ntb_transport_debug_level); +SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, debug_level, CTLFLAG_RWTUN, + &g_ntb_transport_debug_level, 0, + "ntb_transport log level -- higher is more verbose"); +#define ntb_printf(lvl, ...) do { \ + if ((lvl) <= g_ntb_transport_debug_level) { \ + printf(__VA_ARGS__); \ + } \ +} while (0) + +static unsigned transport_mtu = 0x10000; + +static uint64_t max_mw_size; +TUNABLE_QUAD("hw.ntb_transport.max_mw_size", &max_mw_size); +SYSCTL_UQUAD(_hw_ntb_transport, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0, + "If enabled (non-zero), limit the size of large memory windows. 
" + "Both sides of the NTB MUST set the same value here."); + +static unsigned enable_xeon_watchdog; +TUNABLE_INT("hw.ntb_transport.enable_xeon_watchdog", &enable_xeon_watchdog); +SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN, + &enable_xeon_watchdog, 0, "If non-zero, write a register every second to " + "keep a watchdog from tearing down the NTB link"); + +STAILQ_HEAD(ntb_queue_list, ntb_queue_entry); + +typedef uint32_t ntb_q_idx_t; + +struct ntb_queue_entry { + /* ntb_queue list reference */ + STAILQ_ENTRY(ntb_queue_entry) entry; + + /* info on data to be transferred */ + void *cb_data; + void *buf; + uint32_t len; + uint32_t flags; + + struct ntb_transport_qp *qp; + struct ntb_payload_header *x_hdr; + ntb_q_idx_t index; +}; + +struct ntb_rx_info { + ntb_q_idx_t entry; +}; + +struct ntb_transport_qp { + struct ntb_transport_ctx *transport; + device_t dev; + + void *cb_data; + + bool client_ready; + volatile bool link_is_up; + uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */ + + struct ntb_rx_info *rx_info; + struct ntb_rx_info *remote_rx_info; + + void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + struct ntb_queue_list tx_free_q; + struct mtx ntb_tx_free_q_lock; + caddr_t tx_mw; + bus_addr_t tx_mw_phys; + ntb_q_idx_t tx_index; + ntb_q_idx_t tx_max_entry; + uint64_t tx_max_frame; + + void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + struct ntb_queue_list rx_post_q; + struct ntb_queue_list rx_pend_q; + /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */ + struct mtx ntb_rx_q_lock; + struct task rxc_db_work; + struct taskqueue *rxc_tq; + caddr_t rx_buff; + ntb_q_idx_t rx_index; + ntb_q_idx_t rx_max_entry; + uint64_t rx_max_frame; + + void (*event_handler)(void *data, enum ntb_link_event status); + struct callout link_work; + struct callout rx_full; + + uint64_t last_rx_no_buf; + + /* Stats */ + uint64_t rx_bytes; + uint64_t rx_pkts; + uint64_t rx_ring_empty; + uint64_t rx_err_no_buf; + uint64_t rx_err_oflow; + uint64_t rx_err_ver; + uint64_t tx_bytes; + uint64_t tx_pkts; + uint64_t tx_ring_full; + uint64_t tx_err_no_buf; + + struct mtx tx_lock; +}; + +struct ntb_transport_mw { + vm_paddr_t phys_addr; + size_t phys_size; + size_t xlat_align; + size_t xlat_align_size; + bus_addr_t addr_limit; + /* Tx buff is off vbase / phys_addr */ + caddr_t vbase; + size_t xlat_size; + size_t buff_size; + /* Rx buff is off virt_addr / dma_addr */ + caddr_t virt_addr; + bus_addr_t dma_addr; +}; + +struct ntb_transport_child { + device_t dev; + int qpoff; + int qpcnt; + struct ntb_transport_child *next; +}; + +struct ntb_transport_ctx { + device_t dev; + struct ntb_transport_child *child; + struct ntb_transport_mw *mw_vec; + struct ntb_transport_qp *qp_vec; + unsigned mw_count; + unsigned qp_count; + uint64_t qp_bitmap; + volatile bool link_is_up; + struct callout link_work; + struct callout link_watchdog; + struct task link_cleanup; +}; + +enum { + NTBT_DESC_DONE_FLAG = 1 << 0, + NTBT_LINK_DOWN_FLAG = 1 << 1, +}; + +struct ntb_payload_header { + ntb_q_idx_t ver; + uint32_t len; + uint32_t flags; +}; + +enum { + /* + * The order of this enum is part of the remote protocol. Do not + * reorder without bumping protocol version (and it's probably best + * to keep the protocol in lock-step with the Linux NTB driver. + */ + NTBT_VERSION = 0, + NTBT_QP_LINKS, + NTBT_NUM_QPS, + NTBT_NUM_MWS, + /* + * N.B.: transport_link_work assumes MW1 enums = MW0 + 2. 
+ */ + NTBT_MW0_SZ_HIGH, + NTBT_MW0_SZ_LOW, + NTBT_MW1_SZ_HIGH, + NTBT_MW1_SZ_LOW, + + /* + * Some NTB-using hardware have a watchdog to work around NTB hangs; if + * a register or doorbell isn't written every few seconds, the link is + * torn down. Write an otherwise unused register every few seconds to + * work around this watchdog. + */ + NTBT_WATCHDOG_SPAD = 15 +}; + +#define QP_TO_MW(nt, qp) ((qp) % nt->mw_count) +#define NTB_QP_DEF_NUM_ENTRIES 100 +#define NTB_LINK_DOWN_TIMEOUT 10 + +static int ntb_transport_probe(device_t dev); +static int ntb_transport_attach(device_t dev); +static int ntb_transport_detach(device_t dev); +static void ntb_transport_init_queue(struct ntb_transport_ctx *nt, + unsigned int qp_num); +static int ntb_process_tx(struct ntb_transport_qp *qp, + struct ntb_queue_entry *entry); +static void ntb_transport_rxc_db(void *arg, int pending); +static int ntb_process_rxc(struct ntb_transport_qp *qp); +static void ntb_memcpy_rx(struct ntb_transport_qp *qp, + struct ntb_queue_entry *entry, void *offset); +static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp, + void *data); +static void ntb_complete_rxc(struct ntb_transport_qp *qp); +static void ntb_transport_doorbell_callback(void *data, uint32_t vector); +static void ntb_transport_event_callback(void *data); +static void ntb_transport_link_work(void *arg); +static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size); +static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw); +static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, + unsigned int qp_num); +static void ntb_qp_link_work(void *arg); +static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt); +static void ntb_transport_link_cleanup_work(void *, int); +static void ntb_qp_link_down(struct ntb_transport_qp *qp); +static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp); +static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp); +static void ntb_send_link_down(struct ntb_transport_qp *qp); +static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry, + struct ntb_queue_list *list); +static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock, + struct ntb_queue_list *list); +static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock, + struct ntb_queue_list *from, struct ntb_queue_list *to); +static void xeon_link_watchdog_hb(void *); + +static const struct ntb_ctx_ops ntb_transport_ops = { + .link_event = ntb_transport_event_callback, + .db_event = ntb_transport_doorbell_callback, +}; + +MALLOC_DEFINE(M_NTB_T, "ntb_transport", "ntb transport driver"); + +static inline void +iowrite32(uint32_t val, void *addr) +{ + + bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr, + val); +} + +/* Transport Init and teardown */ + +static void +xeon_link_watchdog_hb(void *arg) +{ + struct ntb_transport_ctx *nt; + + nt = arg; + ntb_spad_write(nt->dev, NTBT_WATCHDOG_SPAD, 0); + callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt); +} + +static int +ntb_transport_probe(device_t dev) +{ + + device_set_desc(dev, "NTB Transport"); + return (0); +} + +static int +ntb_transport_attach(device_t dev) +{ + struct ntb_transport_ctx *nt = device_get_softc(dev); + struct ntb_transport_child **cpp = &nt->child; + struct ntb_transport_child *nc; + struct ntb_transport_mw *mw; + uint64_t db_bitmap; + int rc, i, db_count, spad_count, qp, qpu, qpo, qpt; + char cfg[128] = ""; + char buf[32]; + char *n, *np, *c, *name; + + nt->dev = dev; + nt->mw_count = ntb_mw_count(dev); + 
spad_count = ntb_spad_count(dev); + db_bitmap = ntb_db_valid_mask(dev); + db_count = flsll(db_bitmap); + KASSERT(db_bitmap == (1 << db_count) - 1, + ("Doorbells are not sequential (%jx).\n", db_bitmap)); + + device_printf(dev, "%d memory windows, %d scratchpads, " + "%d doorbells\n", nt->mw_count, spad_count, db_count); + + if (nt->mw_count == 0) { + device_printf(dev, "At least 1 memory window required.\n"); + return (ENXIO); + } + if (spad_count < 6) { + device_printf(dev, "At least 6 scratchpads required.\n"); + return (ENXIO); + } + if (spad_count < 4 + 2 * nt->mw_count) { + nt->mw_count = (spad_count - 4) / 2; + device_printf(dev, "Scratchpads enough only for %d " + "memory windows.\n", nt->mw_count); + } + if (db_bitmap == 0) { + device_printf(dev, "At least one doorbell required.\n"); + return (ENXIO); + } + + nt->mw_vec = malloc(nt->mw_count * sizeof(*nt->mw_vec), M_NTB_T, + M_WAITOK | M_ZERO); + for (i = 0; i < nt->mw_count; i++) { + mw = &nt->mw_vec[i]; + + rc = ntb_mw_get_range(dev, i, &mw->phys_addr, &mw->vbase, + &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size, + &mw->addr_limit); + if (rc != 0) + goto err; + + mw->buff_size = 0; + mw->xlat_size = 0; + mw->virt_addr = NULL; + mw->dma_addr = 0; + + rc = ntb_mw_set_wc(dev, i, VM_MEMATTR_WRITE_COMBINING); + if (rc) + ntb_printf(0, "Unable to set mw%d caching\n", i); + } + + qpu = 0; + qpo = imin(db_count, nt->mw_count); + qpt = db_count; + + snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev), + device_get_unit(dev)); + TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg)); + n = cfg; + i = 0; + while ((c = strsep(&n, ",")) != NULL) { + np = c; + name = strsep(&np, ":"); + if (name != NULL && name[0] == 0) + name = NULL; + qp = (np && np[0] != 0) ? strtol(np, NULL, 10) : qpo - qpu; + if (qp <= 0) + qp = 1; + + if (qp > qpt - qpu) { + device_printf(dev, "Not enough resources for config\n"); + break; + } + + nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO); + nc->qpoff = qpu; + nc->qpcnt = qp; + nc->dev = device_add_child(dev, name, -1); + if (nc->dev == NULL) { + device_printf(dev, "Can not add child.\n"); + break; + } + device_set_ivars(nc->dev, nc); + *cpp = nc; + cpp = &nc->next; + + if (bootverbose) { + device_printf(dev, "%d \"%s\": queues %d", + i, name, qpu); + if (qp > 1) + printf("-%d", qpu + qp - 1); + printf("\n"); + } + + qpu += qp; + i++; + } + nt->qp_count = qpu; + + nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_T, + M_WAITOK | M_ZERO); + + for (i = 0; i < nt->qp_count; i++) + ntb_transport_init_queue(nt, i); + + callout_init(&nt->link_work, 0); + callout_init(&nt->link_watchdog, 0); + TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt); + + rc = ntb_set_ctx(dev, nt, &ntb_transport_ops); + if (rc != 0) + goto err; + + nt->link_is_up = false; + ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + + if (enable_xeon_watchdog != 0) + callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt); + + bus_generic_attach(dev); + return (0); + +err: + free(nt->qp_vec, M_NTB_T); + free(nt->mw_vec, M_NTB_T); + return (rc); +} + +static int +ntb_transport_detach(device_t dev) +{ + struct ntb_transport_ctx *nt = device_get_softc(dev); + struct ntb_transport_child **cpp = &nt->child; + struct ntb_transport_child *nc; + int error = 0, i; + + while ((nc = *cpp) != NULL) { + *cpp = (*cpp)->next; + error = device_delete_child(dev, nc->dev); + if (error) + break; + free(nc, M_DEVBUF); + } + KASSERT(nt->qp_bitmap == 0, + ("Some queues not freed on detach (%jx)", nt->qp_bitmap)); + + 
ntb_transport_link_cleanup(nt); + taskqueue_drain(taskqueue_swi, &nt->link_cleanup); + callout_drain(&nt->link_work); + callout_drain(&nt->link_watchdog); + + ntb_link_disable(dev); + ntb_clear_ctx(dev); + + for (i = 0; i < nt->mw_count; i++) + ntb_free_mw(nt, i); + + free(nt->qp_vec, M_NTB_T); + free(nt->mw_vec, M_NTB_T); + return (0); +} + +int +ntb_transport_queue_count(device_t dev) +{ + struct ntb_transport_child *nc = device_get_ivars(dev); + + return (nc->qpcnt); +} + +static void +ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num) +{ + struct ntb_transport_mw *mw; + struct ntb_transport_qp *qp; + vm_paddr_t mw_base; + uint64_t mw_size, qp_offset; + size_t tx_size; + unsigned num_qps_mw, mw_num, mw_count; + + mw_count = nt->mw_count; + mw_num = QP_TO_MW(nt, qp_num); + mw = &nt->mw_vec[mw_num]; + + qp = &nt->qp_vec[qp_num]; + qp->qp_num = qp_num; + qp->transport = nt; + qp->dev = nt->dev; + qp->client_ready = false; + qp->event_handler = NULL; + ntb_qp_link_down_reset(qp); + + if (mw_num < nt->qp_count % mw_count) + num_qps_mw = nt->qp_count / mw_count + 1; + else + num_qps_mw = nt->qp_count / mw_count; + + mw_base = mw->phys_addr; + mw_size = mw->phys_size; + + tx_size = mw_size / num_qps_mw; + qp_offset = tx_size * (qp_num / mw_count); + + qp->tx_mw = mw->vbase + qp_offset; + KASSERT(qp->tx_mw != NULL, ("uh oh?")); + + /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */ + qp->tx_mw_phys = mw_base + qp_offset; + KASSERT(qp->tx_mw_phys != 0, ("uh oh?")); + + tx_size -= sizeof(struct ntb_rx_info); + qp->rx_info = (void *)(qp->tx_mw + tx_size); + + /* Due to house-keeping, there must be at least 2 buffs */ + qp->tx_max_frame = qmin(transport_mtu, tx_size / 2); + qp->tx_max_entry = tx_size / qp->tx_max_frame; + + callout_init(&qp->link_work, 0); + callout_init(&qp->rx_full, 1); + + mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN); + mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN); + mtx_init(&qp->tx_lock, "ntb transport tx", NULL, MTX_DEF); + TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp); + qp->rxc_tq = taskqueue_create("ntbt_rx", M_WAITOK, + taskqueue_thread_enqueue, &qp->rxc_tq); + taskqueue_start_threads(&qp->rxc_tq, 1, PI_NET, "%s rx%d", + device_get_nameunit(nt->dev), qp_num); + + STAILQ_INIT(&qp->rx_post_q); + STAILQ_INIT(&qp->rx_pend_q); + STAILQ_INIT(&qp->tx_free_q); +} + +void +ntb_transport_free_queue(struct ntb_transport_qp *qp) +{ + struct ntb_transport_ctx *nt = qp->transport; + struct ntb_queue_entry *entry; + + callout_drain(&qp->link_work); + + ntb_db_set_mask(qp->dev, 1ull << qp->qp_num); + taskqueue_drain_all(qp->rxc_tq); + taskqueue_free(qp->rxc_tq); + + qp->cb_data = NULL; + qp->rx_handler = NULL; + qp->tx_handler = NULL; + qp->event_handler = NULL; + + while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q))) + free(entry, M_NTB_T); + + while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q))) + free(entry, M_NTB_T); + + while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q))) + free(entry, M_NTB_T); + + nt->qp_bitmap &= ~(1 << qp->qp_num); +} + +/** + * ntb_transport_create_queue - Create a new NTB transport layer queue + * @rx_handler: receive callback function + * @tx_handler: transmit callback function + * @event_handler: event callback function + * + * Create a new NTB transport layer queue and provide the queue with a callback + * routine for both transmit and receive. 
The receive callback routine will be + * used to pass up data when the transport has received it on the queue. The + * transmit callback routine will be called when the transport has completed the + * transmission of the data on the queue and the data is ready to be freed. + * + * RETURNS: pointer to newly created ntb_queue, NULL on error. + */ +struct ntb_transport_qp * +ntb_transport_create_queue(device_t dev, int q, + const struct ntb_queue_handlers *handlers, void *data) +{ + struct ntb_transport_child *nc = device_get_ivars(dev); + struct ntb_transport_ctx *nt = device_get_softc(device_get_parent(dev)); + struct ntb_queue_entry *entry; + struct ntb_transport_qp *qp; + int i; + + if (q < 0 || q >= nc->qpcnt) + return (NULL); + + qp = &nt->qp_vec[nc->qpoff + q]; + nt->qp_bitmap |= (1 << qp->qp_num); + qp->cb_data = data; + qp->rx_handler = handlers->rx_handler; + qp->tx_handler = handlers->tx_handler; + qp->event_handler = handlers->event_handler; + + for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) { + entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO); + entry->cb_data = data; + entry->buf = NULL; + entry->len = transport_mtu; + entry->qp = qp; + ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q); + } + + for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) { + entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO); + entry->qp = qp; + ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); + } + + ntb_db_clear(dev, 1ull << qp->qp_num); + return (qp); +} + +/** + * ntb_transport_link_up - Notify NTB transport of client readiness to use queue + * @qp: NTB transport layer queue to be enabled + * + * Notify NTB transport layer of client readiness to use queue + */ +void +ntb_transport_link_up(struct ntb_transport_qp *qp) +{ + struct ntb_transport_ctx *nt = qp->transport; + + qp->client_ready = true; + + ntb_printf(2, "qp %d client ready\n", qp->qp_num); + + if (nt->link_is_up) + callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp); +} + + + +/* Transport Tx */ + +/** + * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry + * @qp: NTB transport layer queue the entry is to be enqueued on + * @cb: per buffer pointer for callback function to use + * @data: pointer to data buffer that will be sent + * @len: length of the data buffer + * + * Enqueue a new transmit buffer onto the transport queue from which a NTB + * payload will be transmitted. This assumes that a lock is being held to + * serialize access to the qp. + * + * RETURNS: An appropriate ERRNO error value on error, or zero for success. + */ +int +ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, + unsigned int len) +{ + struct ntb_queue_entry *entry; + int rc; + + if (!qp->link_is_up || len == 0) { + CTR0(KTR_NTB, "TX: link not up"); + return (EINVAL); + } + + entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q); + if (entry == NULL) { + CTR0(KTR_NTB, "TX: could not get entry from tx_free_q"); + qp->tx_err_no_buf++; + return (EBUSY); + } + CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry); + + entry->cb_data = cb; + entry->buf = data; + entry->len = len; + entry->flags = 0; + + mtx_lock(&qp->tx_lock); + rc = ntb_process_tx(qp, entry); + mtx_unlock(&qp->tx_lock); + if (rc != 0) { + ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); + CTR1(KTR_NTB, + "TX: process_tx failed. 
Returning entry %p to tx_free_q", + entry); + } + return (rc); +} + +static void +ntb_tx_copy_callback(void *data) +{ + struct ntb_queue_entry *entry = data; + struct ntb_transport_qp *qp = entry->qp; + struct ntb_payload_header *hdr = entry->x_hdr; + + iowrite32(entry->flags | NTBT_DESC_DONE_FLAG, &hdr->flags); + CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr); + + ntb_peer_db_set(qp->dev, 1ull << qp->qp_num); + + /* + * The entry length can only be zero if the packet is intended to be a + * "link down" or similar. Since no payload is being sent in these + * cases, there is nothing to add to the completion queue. + */ + if (entry->len > 0) { + qp->tx_bytes += entry->len; + + if (qp->tx_handler) + qp->tx_handler(qp, qp->cb_data, entry->buf, + entry->len); + else + m_freem(entry->buf); + entry->buf = NULL; + } + + CTR3(KTR_NTB, + "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning " + "to tx_free_q", entry, hdr->ver, hdr->flags); + ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); +} + +static void +ntb_memcpy_tx(struct ntb_queue_entry *entry, void *offset) +{ + + CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset); + if (entry->buf != NULL) { + m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset); + + /* + * Ensure that the data is fully copied before setting the + * flags + */ + wmb(); + } + + ntb_tx_copy_callback(entry); +} + +static void +ntb_async_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry) +{ + struct ntb_payload_header *hdr; + void *offset; + + offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index; + hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame - + sizeof(struct ntb_payload_header)); + entry->x_hdr = hdr; + + iowrite32(entry->len, &hdr->len); + iowrite32(qp->tx_pkts, &hdr->ver); + + ntb_memcpy_tx(entry, offset); +} + +static int +ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry) +{ + + CTR3(KTR_NTB, + "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u", + qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry); + if (qp->tx_index == qp->remote_rx_info->entry) { + CTR0(KTR_NTB, "TX: ring full"); + qp->tx_ring_full++; + return (EAGAIN); + } + + if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) { + if (qp->tx_handler != NULL) + qp->tx_handler(qp, qp->cb_data, entry->buf, + EIO); + else + m_freem(entry->buf); + + entry->buf = NULL; + ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q); + CTR1(KTR_NTB, + "TX: frame too big. returning entry %p to tx_free_q", + entry); + return (0); + } + CTR2(KTR_NTB, "TX: copying entry %p to index %u", entry, qp->tx_index); + ntb_async_tx(qp, entry); + + qp->tx_index++; + qp->tx_index %= qp->tx_max_entry; + + qp->tx_pkts++; + + return (0); +} + +/* Transport Rx */ +static void +ntb_transport_rxc_db(void *arg, int pending __unused) +{ + struct ntb_transport_qp *qp = arg; + int rc; + + CTR0(KTR_NTB, "RX: transport_rx"); +again: + while ((rc = ntb_process_rxc(qp)) == 0) + ; + CTR1(KTR_NTB, "RX: process_rxc returned %d", rc); + + if ((ntb_db_read(qp->dev) & (1ull << qp->qp_num)) != 0) { + /* If db is set, clear it and check queue once more. 
*/ + ntb_db_clear(qp->dev, 1ull << qp->qp_num); + goto again; + } +} + +static int +ntb_process_rxc(struct ntb_transport_qp *qp) +{ + struct ntb_payload_header *hdr; + struct ntb_queue_entry *entry; + caddr_t offset; + + offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index; + hdr = (void *)(offset + qp->rx_max_frame - + sizeof(struct ntb_payload_header)); + + CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index); + if ((hdr->flags & NTBT_DESC_DONE_FLAG) == 0) { + CTR0(KTR_NTB, "RX: hdr not done"); + qp->rx_ring_empty++; + return (EAGAIN); + } + + if ((hdr->flags & NTBT_LINK_DOWN_FLAG) != 0) { + CTR0(KTR_NTB, "RX: link down"); + ntb_qp_link_down(qp); + hdr->flags = 0; + return (EAGAIN); + } + + if (hdr->ver != (uint32_t)qp->rx_pkts) { + CTR2(KTR_NTB,"RX: ver != rx_pkts (%x != %lx). " + "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts); + qp->rx_err_ver++; + return (EIO); + } + + entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q); + if (entry == NULL) { + qp->rx_err_no_buf++; + CTR0(KTR_NTB, "RX: No entries in rx_pend_q"); + return (EAGAIN); + } + callout_stop(&qp->rx_full); + CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry); + + entry->x_hdr = hdr; + entry->index = qp->rx_index; + + if (hdr->len > entry->len) { + CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju", + (uintmax_t)hdr->len, (uintmax_t)entry->len); + qp->rx_err_oflow++; + + entry->len = -EIO; + entry->flags |= NTBT_DESC_DONE_FLAG; + + ntb_complete_rxc(qp); + } else { + qp->rx_bytes += hdr->len; + qp->rx_pkts++; + + CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts); + + entry->len = hdr->len; + + ntb_memcpy_rx(qp, entry, offset); + } + + qp->rx_index++; + qp->rx_index %= qp->rx_max_entry; + return (0); +} + +static void +ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry, + void *offset) +{ + struct ifnet *ifp = entry->cb_data; + unsigned int len = entry->len; + + CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset); + + entry->buf = (void *)m_devget(offset, len, 0, ifp, NULL); + if (entry->buf == NULL) + entry->len = -ENOMEM; + + /* Ensure that the data is globally visible before clearing the flag */ + wmb(); + + CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, entry->buf); + ntb_rx_copy_callback(qp, entry); +} + +static inline void +ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data) +{ + struct ntb_queue_entry *entry; + + entry = data; + entry->flags |= NTBT_DESC_DONE_FLAG; + ntb_complete_rxc(qp); +} + +static void +ntb_complete_rxc(struct ntb_transport_qp *qp) +{ + struct ntb_queue_entry *entry; + struct mbuf *m; + unsigned len; + + CTR0(KTR_NTB, "RX: rx_completion_task"); + + mtx_lock_spin(&qp->ntb_rx_q_lock); + + while (!STAILQ_EMPTY(&qp->rx_post_q)) { + entry = STAILQ_FIRST(&qp->rx_post_q); + if ((entry->flags & NTBT_DESC_DONE_FLAG) == 0) + break; + + entry->x_hdr->flags = 0; + iowrite32(entry->index, &qp->rx_info->entry); + + STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry); + + len = entry->len; + m = entry->buf; + + /* + * Re-initialize queue_entry for reuse; rx_handler takes + * ownership of the mbuf. 
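+ * The mbuf is handed to rx_handler (or freed with m_freem() when no
+ * consumer is attached); the entry itself is never freed here, only
+ * recycled onto rx_pend_q with a fresh MTU-sized length.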
+ */ + entry->buf = NULL; + entry->len = transport_mtu; + entry->cb_data = qp->cb_data; + + STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry); + + mtx_unlock_spin(&qp->ntb_rx_q_lock); + + CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m); + if (qp->rx_handler != NULL && qp->client_ready) + qp->rx_handler(qp, qp->cb_data, m, len); + else + m_freem(m); + + mtx_lock_spin(&qp->ntb_rx_q_lock); + } + + mtx_unlock_spin(&qp->ntb_rx_q_lock); +} + +static void +ntb_transport_doorbell_callback(void *data, uint32_t vector) +{ + struct ntb_transport_ctx *nt = data; + struct ntb_transport_qp *qp; + uint64_t vec_mask; + unsigned qp_num; + + vec_mask = ntb_db_vector_mask(nt->dev, vector); + vec_mask &= nt->qp_bitmap; + if ((vec_mask & (vec_mask - 1)) != 0) + vec_mask &= ntb_db_read(nt->dev); + while (vec_mask != 0) { + qp_num = ffsll(vec_mask) - 1; + + qp = &nt->qp_vec[qp_num]; + if (qp->link_is_up) + taskqueue_enqueue(qp->rxc_tq, &qp->rxc_db_work); + + vec_mask &= ~(1ull << qp_num); + } +} + +/* Link Event handler */ +static void +ntb_transport_event_callback(void *data) +{ + struct ntb_transport_ctx *nt = data; + + if (ntb_link_is_up(nt->dev, NULL, NULL)) { + ntb_printf(1, "HW link up\n"); + callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt); + } else { + ntb_printf(1, "HW link down\n"); + taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup); + } +} + +/* Link bring up */ +static void +ntb_transport_link_work(void *arg) +{ + struct ntb_transport_ctx *nt = arg; + device_t dev = nt->dev; + struct ntb_transport_qp *qp; + uint64_t val64, size; + uint32_t val; + unsigned i; + int rc; + + /* send the local info, in the opposite order of the way we read it */ + for (i = 0; i < nt->mw_count; i++) { + size = nt->mw_vec[i].phys_size; + + if (max_mw_size != 0 && size > max_mw_size) + size = max_mw_size; + + ntb_peer_spad_write(dev, NTBT_MW0_SZ_HIGH + (i * 2), + size >> 32); + ntb_peer_spad_write(dev, NTBT_MW0_SZ_LOW + (i * 2), size); + } + ntb_peer_spad_write(dev, NTBT_NUM_MWS, nt->mw_count); + ntb_peer_spad_write(dev, NTBT_NUM_QPS, nt->qp_count); + ntb_peer_spad_write(dev, NTBT_QP_LINKS, 0); + ntb_peer_spad_write(dev, NTBT_VERSION, NTB_TRANSPORT_VERSION); + + /* Query the remote side for its info */ + val = 0; + ntb_spad_read(dev, NTBT_VERSION, &val); + if (val != NTB_TRANSPORT_VERSION) + goto out; + + ntb_spad_read(dev, NTBT_NUM_QPS, &val); + if (val != nt->qp_count) + goto out; + + ntb_spad_read(dev, NTBT_NUM_MWS, &val); + if (val != nt->mw_count) + goto out; + + for (i = 0; i < nt->mw_count; i++) { + ntb_spad_read(dev, NTBT_MW0_SZ_HIGH + (i * 2), &val); + val64 = (uint64_t)val << 32; + + ntb_spad_read(dev, NTBT_MW0_SZ_LOW + (i * 2), &val); + val64 |= val; + + rc = ntb_set_mw(nt, i, val64); + if (rc != 0) + goto free_mws; + } + + nt->link_is_up = true; + ntb_printf(1, "transport link up\n"); + + for (i = 0; i < nt->qp_count; i++) { + qp = &nt->qp_vec[i]; + + ntb_transport_setup_qp_mw(nt, i); + + if (qp->client_ready) + callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp); + } + + return; + +free_mws: + for (i = 0; i < nt->mw_count; i++) + ntb_free_mw(nt, i); +out: + if (ntb_link_is_up(dev, NULL, NULL)) + callout_reset(&nt->link_work, + NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt); +} + +static int +ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size) +{ + struct ntb_transport_mw *mw = &nt->mw_vec[num_mw]; + size_t xlat_size, buff_size; + int rc; + + if (size == 0) + return (EINVAL); + + xlat_size = roundup(size, mw->xlat_align_size); + buff_size = xlat_size; 
+ + /* No need to re-setup */ + if (mw->xlat_size == xlat_size) + return (0); + + if (mw->buff_size != 0) + ntb_free_mw(nt, num_mw); + + /* Alloc memory for receiving data. Must be aligned */ + mw->xlat_size = xlat_size; + mw->buff_size = buff_size; + + mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_T, M_ZERO, 0, + mw->addr_limit, mw->xlat_align, 0); + if (mw->virt_addr == NULL) { + ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n", + mw->buff_size, mw->xlat_size); + mw->xlat_size = 0; + mw->buff_size = 0; + return (ENOMEM); + } + /* TODO: replace with bus_space_* functions */ + mw->dma_addr = vtophys(mw->virt_addr); + + /* + * Ensure that the allocation from contigmalloc is aligned as + * requested. XXX: This may not be needed -- brought in for parity + * with the Linux driver. + */ + if (mw->dma_addr % mw->xlat_align != 0) { + ntb_printf(0, + "DMA memory 0x%jx not aligned to BAR size 0x%zx\n", + (uintmax_t)mw->dma_addr, size); + ntb_free_mw(nt, num_mw); + return (ENOMEM); + } + + /* Notify HW the memory location of the receive buffer */ + rc = ntb_mw_set_trans(nt->dev, num_mw, mw->dma_addr, mw->xlat_size); + if (rc) { + ntb_printf(0, "Unable to set mw%d translation\n", num_mw); + ntb_free_mw(nt, num_mw); + return (rc); + } + + return (0); +} + +static void +ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw) +{ + struct ntb_transport_mw *mw = &nt->mw_vec[num_mw]; + + if (mw->virt_addr == NULL) + return; + + ntb_mw_clear_trans(nt->dev, num_mw); + contigfree(mw->virt_addr, mw->xlat_size, M_NTB_T); + mw->xlat_size = 0; + mw->buff_size = 0; + mw->virt_addr = NULL; +} + +static int +ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num) +{ + struct ntb_transport_qp *qp = &nt->qp_vec[qp_num]; + struct ntb_transport_mw *mw; + void *offset; + ntb_q_idx_t i; + size_t rx_size; + unsigned num_qps_mw, mw_num, mw_count; + + mw_count = nt->mw_count; + mw_num = QP_TO_MW(nt, qp_num); + mw = &nt->mw_vec[mw_num]; + + if (mw->virt_addr == NULL) + return (ENOMEM); + + if (mw_num < nt->qp_count % mw_count) + num_qps_mw = nt->qp_count / mw_count + 1; + else + num_qps_mw = nt->qp_count / mw_count; + + rx_size = mw->xlat_size / num_qps_mw; + qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count); + rx_size -= sizeof(struct ntb_rx_info); + + qp->remote_rx_info = (void*)(qp->rx_buff + rx_size); + + /* Due to house-keeping, there must be at least 2 buffs */ + qp->rx_max_frame = qmin(transport_mtu, rx_size / 2); + qp->rx_max_entry = rx_size / qp->rx_max_frame; + qp->rx_index = 0; + + qp->remote_rx_info->entry = qp->rx_max_entry - 1; + + /* Set up the hdr offsets with 0s */ + for (i = 0; i < qp->rx_max_entry; i++) { + offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) - + sizeof(struct ntb_payload_header)); + memset(offset, 0, sizeof(struct ntb_payload_header)); + } + + qp->rx_pkts = 0; + qp->tx_pkts = 0; + qp->tx_index = 0; + + return (0); +} + +static void +ntb_qp_link_work(void *arg) +{ + struct ntb_transport_qp *qp = arg; + device_t dev = qp->dev; + struct ntb_transport_ctx *nt = qp->transport; + int i; + uint32_t val; + + /* Report queues that are up on our side */ + for (i = 0, val = 0; i < nt->qp_count; i++) { + if (nt->qp_vec[i].client_ready) + val |= (1 << i); + } + ntb_peer_spad_write(dev, NTBT_QP_LINKS, val); + + /* See if the remote side is up */ + ntb_spad_read(dev, NTBT_QP_LINKS, &val); + if ((val & (1ull << qp->qp_num)) != 0) { + ntb_printf(2, "qp %d link up\n", qp->qp_num); + qp->link_is_up = true; + + if (qp->event_handler != NULL) + 
qp->event_handler(qp->cb_data, NTB_LINK_UP); + + ntb_db_clear_mask(dev, 1ull << qp->qp_num); + } else if (nt->link_is_up) + callout_reset(&qp->link_work, + NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp); +} + +/* Link down event */ +static void +ntb_transport_link_cleanup(struct ntb_transport_ctx *nt) +{ + struct ntb_transport_qp *qp; + int i; + + /* Pass along the info to any clients */ + for (i = 0; i < nt->qp_count; i++) { + if ((nt->qp_bitmap & (1 << i)) != 0) { + qp = &nt->qp_vec[i]; + ntb_qp_link_cleanup(qp); + callout_drain(&qp->link_work); + } + } + + if (!nt->link_is_up) + callout_drain(&nt->link_work); + + /* + * The scratchpad registers keep their values if the remote side + * goes down; blast them now to give them a sane value the next + * time they are accessed. + */ + ntb_spad_clear(nt->dev); +} + +static void +ntb_transport_link_cleanup_work(void *arg, int pending __unused) +{ + + ntb_transport_link_cleanup(arg); +} + +static void +ntb_qp_link_down(struct ntb_transport_qp *qp) +{ + + ntb_qp_link_cleanup(qp); +} + +static void +ntb_qp_link_down_reset(struct ntb_transport_qp *qp) +{ + + qp->link_is_up = false; + ntb_db_set_mask(qp->dev, 1ull << qp->qp_num); + + qp->tx_index = qp->rx_index = 0; + qp->tx_bytes = qp->rx_bytes = 0; + qp->tx_pkts = qp->rx_pkts = 0; + + qp->rx_ring_empty = 0; + qp->tx_ring_full = 0; + + qp->rx_err_no_buf = qp->tx_err_no_buf = 0; + qp->rx_err_oflow = qp->rx_err_ver = 0; +} + +static void +ntb_qp_link_cleanup(struct ntb_transport_qp *qp) +{ + + callout_drain(&qp->link_work); + ntb_qp_link_down_reset(qp); + + if (qp->event_handler != NULL) + qp->event_handler(qp->cb_data, NTB_LINK_DOWN); +} + +/* Link commanded down */ +/** + * ntb_transport_link_down - Notify NTB transport to no longer enqueue data + * @qp: NTB transport layer queue to be disabled + * + * Notify the NTB transport layer of the client's desire to no longer receive + * data on the specified transport queue. It is the client's responsibility to + * ensure all entries on the queue are purged or otherwise handled appropriately. 
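A sketch of the teardown order this contract implies for a client such as if_ntb, using the public API declared in ntb_transport.h later in this diff (the softc type and field are hypothetical):

#include "ntb_transport.h"

struct my_client_softc {
	struct ntb_transport_qp *qp;
};

static void
my_client_detach(struct my_client_softc *sc)
{
	/* Stop the transport from delivering data for this queue... */
	ntb_transport_link_down(sc->qp);
	/* ...then release it; purging queued entries is the client's job. */
	ntb_transport_free_queue(sc->qp);
	sc->qp = NULL;
}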
+ */ +void +ntb_transport_link_down(struct ntb_transport_qp *qp) +{ + struct ntb_transport_ctx *nt = qp->transport; + int i; + uint32_t val; + + qp->client_ready = false; + for (i = 0, val = 0; i < nt->qp_count; i++) { + if (nt->qp_vec[i].client_ready) + val |= (1 << i); + } + ntb_peer_spad_write(qp->dev, NTBT_QP_LINKS, val); + + if (qp->link_is_up) + ntb_send_link_down(qp); + else + callout_drain(&qp->link_work); +} + +/** + * ntb_transport_link_query - Query transport link state + * @qp: NTB transport layer queue to be queried + * + * Query connectivity to the remote system of the NTB transport queue + * + * RETURNS: true for link up or false for link down + */ +bool +ntb_transport_link_query(struct ntb_transport_qp *qp) +{ + + return (qp->link_is_up); +} + +static void +ntb_send_link_down(struct ntb_transport_qp *qp) +{ + struct ntb_queue_entry *entry; + int i, rc; + + if (!qp->link_is_up) + return; + + for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) { + entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q); + if (entry != NULL) + break; + pause("NTB Wait for link down", hz / 10); + } + + if (entry == NULL) + return; + + entry->cb_data = NULL; + entry->buf = NULL; + entry->len = 0; + entry->flags = NTBT_LINK_DOWN_FLAG; + + mtx_lock(&qp->tx_lock); + rc = ntb_process_tx(qp, entry); + mtx_unlock(&qp->tx_lock); + if (rc != 0) + printf("ntb: Failed to send link down\n"); + + ntb_qp_link_down_reset(qp); +} + + +/* List Management */ + +static void +ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry, + struct ntb_queue_list *list) +{ + + mtx_lock_spin(lock); + STAILQ_INSERT_TAIL(list, entry, entry); + mtx_unlock_spin(lock); +} + +static struct ntb_queue_entry * +ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list) +{ + struct ntb_queue_entry *entry; + + mtx_lock_spin(lock); + if (STAILQ_EMPTY(list)) { + entry = NULL; + goto out; + } + entry = STAILQ_FIRST(list); + STAILQ_REMOVE_HEAD(list, entry); +out: + mtx_unlock_spin(lock); + + return (entry); +} + +static struct ntb_queue_entry * +ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from, + struct ntb_queue_list *to) +{ + struct ntb_queue_entry *entry; + + mtx_lock_spin(lock); + if (STAILQ_EMPTY(from)) { + entry = NULL; + goto out; + } + entry = STAILQ_FIRST(from); + STAILQ_REMOVE_HEAD(from, entry); + STAILQ_INSERT_TAIL(to, entry, entry); + +out: + mtx_unlock_spin(lock); + return (entry); +} + +/** + * ntb_transport_qp_num - Query the qp number + * @qp: NTB transport layer queue to be queried + * + * Query qp number of the NTB transport queue + * + * RETURNS: a zero based number specifying the qp number + */ +unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp) +{ + + return (qp->qp_num); +} + +/** + * ntb_transport_max_size - Query the max payload size of a qp + * @qp: NTB transport layer queue to be queried + * + * Query the maximum payload size permissible on the given qp + * + * RETURNS: the max payload size of a qp + */ +unsigned int +ntb_transport_max_size(struct ntb_transport_qp *qp) +{ + + return (qp->tx_max_frame - sizeof(struct ntb_payload_header)); +} + +unsigned int +ntb_transport_tx_free_entry(struct ntb_transport_qp *qp) +{ + unsigned int head = qp->tx_index; + unsigned int tail = qp->remote_rx_info->entry; + + return (tail >= head ? 
tail - head : qp->tx_max_entry + tail - head); +} + +static device_method_t ntb_transport_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ntb_transport_probe), + DEVMETHOD(device_attach, ntb_transport_attach), + DEVMETHOD(device_detach, ntb_transport_detach), + DEVMETHOD_END +}; + +devclass_t ntb_transport_devclass; +static DEFINE_CLASS_0(ntb_transport, ntb_transport_driver, + ntb_transport_methods, sizeof(struct ntb_transport_ctx)); +DRIVER_MODULE(ntb_transport, ntb_hw, ntb_transport_driver, + ntb_transport_devclass, NULL, NULL); +MODULE_DEPEND(ntb_transport, ntb, 1, 1, 1); +MODULE_VERSION(ntb_transport, 1); diff --git a/sys/dev/ntb/ntb_transport.h b/sys/dev/ntb/ntb_transport.h new file mode 100644 index 0000000..63cdbce --- /dev/null +++ b/sys/dev/ntb/ntb_transport.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +struct ntb_transport_qp; + +extern devclass_t ntb_transport_devclass; + +enum ntb_link_event { + NTB_LINK_DOWN = 0, + NTB_LINK_UP, +}; + +struct ntb_queue_handlers { + void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data, + void *data, int len); + void (*event_handler)(void *data, enum ntb_link_event status); +}; + +int ntb_transport_queue_count(device_t dev); +struct ntb_transport_qp * +ntb_transport_create_queue(device_t dev, int q, + const struct ntb_queue_handlers *handlers, void *data); +void ntb_transport_free_queue(struct ntb_transport_qp *qp); +unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp); +unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp); +int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, + unsigned int len); +int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data, + unsigned int len); +void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len); +void ntb_transport_link_up(struct ntb_transport_qp *qp); +void ntb_transport_link_down(struct ntb_transport_qp *qp); +bool ntb_transport_link_query(struct ntb_transport_qp *qp); +unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp); diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 4ceb075..d70aaad 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -1744,7 +1744,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count, for (i = 0; i < count; i++) { if (vectors[i] == 0) continue; - irq = msix->msix_vectors[vectors[i]].mv_irq; + irq = msix->msix_vectors[vectors[i] - 1].mv_irq; resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq, irq, 1); } @@ -1758,7 +1758,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count, printf("---"); else printf("%d", - msix->msix_vectors[vectors[i]].mv_irq); + msix->msix_vectors[vectors[i] - 1].mv_irq); } printf("\n"); } diff --git a/sys/dev/sfxge/sfxge_ev.c b/sys/dev/sfxge/sfxge_ev.c index d5aff5f..06ffed2 100644 --- a/sys/dev/sfxge/sfxge_ev.c +++ b/sys/dev/sfxge/sfxge_ev.c @@ -448,7 +448,7 @@ sfxge_ev_stat_update(struct sfxge_softc *sc) goto out; now = ticks; - if (now - sc->ev_stats_update_time < hz) + if ((unsigned int)(now - sc->ev_stats_update_time) < (unsigned int)hz) goto out; sc->ev_stats_update_time = now; diff --git a/sys/dev/sfxge/sfxge_port.c b/sys/dev/sfxge/sfxge_port.c index 709ed78..a4f671f 100644 --- a/sys/dev/sfxge/sfxge_port.c +++ b/sys/dev/sfxge/sfxge_port.c @@ -62,7 +62,7 @@ sfxge_mac_stat_update(struct sfxge_softc *sc) } now = ticks; - if (now - port->mac_stats.update_time < hz) { + if ((unsigned int)(now - port->mac_stats.update_time) < (unsigned int)hz) { rc = 0; goto out; } @@ -543,7 +543,7 @@ sfxge_phy_stat_update(struct sfxge_softc *sc) } now = ticks; - if (now - port->phy_stats.update_time < hz) { + if ((unsigned int)(now - port->phy_stats.update_time) < (unsigned int)hz) { rc = 0; goto out; } diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 4d96840..8fa6bcd 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -728,6 +728,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, if (flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; + td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); if ((flags & RFSTOPPED) == 0) { @@ -1063,7 +1064,7 @@ fork_return(struct thread *td, struct trapframe *frame) * parent's children, do it now. 
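The sfxge_ev.c and sfxge_port.c hunks above rate-limit statistics refreshes against the wrapping kernel "ticks" counter. Casting the delta to unsigned makes the subtraction well defined across the wrap (signed overflow is undefined behavior in C), and the modular result stays correct for any interval under 2^31 ticks. A minimal sketch of the idiom (illustration only):

#include <stdbool.h>

/* now and last are snapshots of a free-running, wrapping tick counter. */
static bool
stats_update_due(unsigned int now, unsigned int last, unsigned int hz)
{
	/* Unsigned subtraction wraps modulo 2^32, giving the true delta. */
	return ((now - last) >= hz);
}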
*/ dbg = p->p_pptr->p_pptr; - proc_set_traced(p); + proc_set_traced(p, true); CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 2c37d76..75121b5 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -2510,7 +2510,7 @@ ptracestop(struct thread *td, int sig) * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || - ((p->p_flag & P2_PTRACE_FSTP) == 0 && + ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xstat = sig; p->p_xthread = td; diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index f2b83f0..201d876 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -249,5 +249,13 @@ again: cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); + + if (td->td_dbgflags & TDB_VFORK) { + PROC_LOCK(p); + if (p->p_ptevents & PTRACE_VFORK) + ptracestop(td, SIGTRAP); + td->td_dbgflags &= ~TDB_VFORK; + PROC_UNLOCK(p); + } } } diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index c4533ce..b2dbf72 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -649,12 +649,13 @@ sys_ptrace(struct thread *td, struct ptrace_args *uap) #endif void -proc_set_traced(struct proc *p) +proc_set_traced(struct proc *p, bool stop) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; - p->p_flag2 |= P2_PTRACE_FSTP; + if (stop) + p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; p->p_oppid = p->p_pptr->p_pid; } @@ -867,7 +868,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ - proc_set_traced(p); + proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); @@ -884,7 +885,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) * The old parent is remembered so we can put things back * on a "detach". 
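With the TDB_VFORK plumbing above, a vfork is reported to the debugger twice: PL_FLAG_VFORKED accompanies the fork event, and PL_FLAG_VFORK_DONE fires once the parent resumes. A sketch of a debugger consuming both, assuming the PT_GET_EVENT_MASK/PT_SET_EVENT_MASK requests that pair with the event-mask validation in kern_ptrace() below:

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <stdio.h>

static void
trace_vfork(pid_t pid)
{
	struct ptrace_lwpinfo pl;
	int events, status;

	/* Opt in to vfork reporting on an already-attached process. */
	ptrace(PT_GET_EVENT_MASK, pid, (caddr_t)&events, sizeof(events));
	events |= PTRACE_VFORK;
	ptrace(PT_SET_EVENT_MASK, pid, (caddr_t)&events, sizeof(events));

	for (;;) {
		ptrace(PT_CONTINUE, pid, (caddr_t)1, 0);
		if (waitpid(pid, &status, 0) == -1 || !WIFSTOPPED(status))
			break;
		ptrace(PT_LWPINFO, pid, (caddr_t)&pl, sizeof(pl));
		if (pl.pl_flags & PL_FLAG_VFORKED)
			printf("vforked child %d\n", (int)pl.pl_child_pid);
		if (pl.pl_flags & PL_FLAG_VFORK_DONE)
			printf("vfork parent resumed\n");
	}
}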
*/ - proc_set_traced(p); + proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc); } @@ -957,7 +958,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | - PTRACE_FORK | PTRACE_LWP)) != 0) { + PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } @@ -1296,7 +1297,11 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; - } + if (td2->td_dbgflags & TDB_VFORK) + pl->pl_flags |= PL_FLAG_VFORKED; + } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == + TDB_VFORK) + pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 89b7a00..0fa87f9 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1582,7 +1582,7 @@ static struct aiocb_ops aiocb_ops_osigevent = { */ int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, - int type, struct aiocb_ops *ops) + int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; cap_rights_t rights; @@ -2568,14 +2568,9 @@ static int kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp, struct aiocb_ops *ops) { - struct proc *p = td->td_proc; - struct kaioinfo *ki; if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); - ki = p->p_aioinfo; - if (ki == NULL) - aio_init_aioinfo(p); return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops)); } diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 94b8149..166ed65 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -635,7 +635,6 @@ int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; - struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; diff --git a/sys/modules/ntb/Makefile b/sys/modules/ntb/Makefile index a5169a0..3eaf751 100644 --- a/sys/modules/ntb/Makefile +++ b/sys/modules/ntb/Makefile @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR= ntb_hw if_ntb +SUBDIR= ntb ntb_hw ntb_transport if_ntb .include <bsd.subdir.mk> diff --git a/sys/modules/ntb/ntb/Makefile b/sys/modules/ntb/ntb/Makefile new file mode 100644 index 0000000..a343f28 --- /dev/null +++ b/sys/modules/ntb/ntb/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../../dev/ntb + +KMOD = ntb +SRCS = ntb.c ntb_if.c +SRCS += device_if.h bus_if.h ntb_if.h + +MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m + +.include <bsd.kmod.mk> diff --git a/sys/modules/ntb/ntb_hw/Makefile b/sys/modules/ntb/ntb_hw/Makefile index fc46b46..5240411 100644 --- a/sys/modules/ntb/ntb_hw/Makefile +++ b/sys/modules/ntb/ntb_hw/Makefile @@ -4,6 +4,8 @@ KMOD = ntb_hw SRCS = ntb_hw.c -SRCS += device_if.h bus_if.h pci_if.h +SRCS += device_if.h bus_if.h pci_if.h ntb_if.h + +MFILES= kern/bus_if.m kern/device_if.m dev/pci/pci_if.m dev/ntb/ntb_if.m .include <bsd.kmod.mk> diff --git a/sys/modules/ntb/ntb_transport/Makefile b/sys/modules/ntb/ntb_transport/Makefile new file mode 100644 index 0000000..5055600 --- /dev/null +++ b/sys/modules/ntb/ntb_transport/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../../dev/ntb + +KMOD = ntb_transport +SRCS = ntb_transport.c +SRCS += device_if.h bus_if.h ntb_if.h + +MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m + +.include <bsd.kmod.mk> diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 14d9967..57aadc0 
100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -165,7 +165,8 @@ __FBSDID("$FreeBSD$"); /* * List of capabilities to possibly mask on the member interface. */ -#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) +#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\ + IFCAP_TXCSUM_IPV6) /* * List of capabilities to strip diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index a1ceaab..8c56a13 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -2758,8 +2758,8 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, switch (af) { #ifdef INET case AF_INET: - if ((a->addr32[0] < b->addr32[0]) || - (a->addr32[0] > e->addr32[0])) + if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) || + (ntohl(a->addr32[0]) > ntohl(e->addr32[0]))) return (0); break; #endif /* INET */ @@ -2769,15 +2769,15 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, /* check a >= b */ for (i = 0; i < 4; ++i) - if (a->addr32[i] > b->addr32[i]) + if (ntohl(a->addr32[i]) > ntohl(b->addr32[i])) break; - else if (a->addr32[i] < b->addr32[i]) + else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i])) return (0); /* check a <= e */ for (i = 0; i < 4; ++i) - if (a->addr32[i] < e->addr32[i]) + if (ntohl(a->addr32[i]) < ntohl(e->addr32[i])) break; - else if (a->addr32[i] > e->addr32[i]) + else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i])) return (0); break; } diff --git a/sys/sys/param.h b/sys/sys/param.h index 4b6c601..07f69c6 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1003505 /* Master, propagated to newvers */ +#define __FreeBSD_version 1003506 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 1b8bda5..59c75c5 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -398,6 +398,7 @@ do { \ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ +#define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ /* @@ -563,7 +564,7 @@ struct proc { u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ - char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ + char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ void *p_pad0; struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. 
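The pf.c hunks above matter because pf_addr.addr32[] holds addresses in network byte order: comparing the raw words on a little-endian host effectively compares the last octet first, so range matching must convert to host order. A sketch of the corrected comparison for the IPv4 case (illustration only):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

/* a, b, e in network byte order; is a within [b, e]? */
static bool
v4_in_range(uint32_t a, uint32_t b, uint32_t e)
{
	return (ntohl(a) >= ntohl(b) && ntohl(a) <= ntohl(e));
}

For example, 10.0.1.2 lies inside 10.0.0.200 to 10.0.1.100, but a raw word comparison on a little-endian machine rejects it because 0x0201000a sorts below 0xc800000a.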
*/ @@ -932,7 +933,7 @@ void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); -void proc_set_traced(struct proc *p); +void proc_set_traced(struct proc *p, bool stop); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); diff --git a/sys/sys/ptrace.h b/sys/sys/ptrace.h index e2b6a5f..f5f1db2 100644 --- a/sys/sys/ptrace.h +++ b/sys/sys/ptrace.h @@ -89,6 +89,7 @@ #define PTRACE_SYSCALL (PTRACE_SCE | PTRACE_SCX) #define PTRACE_FORK 0x0008 #define PTRACE_LWP 0x0010 +#define PTRACE_VFORK 0x0020 #define PTRACE_DEFAULT (PTRACE_EXEC) @@ -124,6 +125,8 @@ struct ptrace_lwpinfo { #define PL_FLAG_CHILD 0x80 /* I am from child */ #define PL_FLAG_BORN 0x100 /* new LWP */ #define PL_FLAG_EXITED 0x200 /* exiting LWP */ +#define PL_FLAG_VFORKED 0x400 /* new child via vfork */ +#define PL_FLAG_VFORK_DONE 0x800 /* vfork parent has resumed */ sigset_t pl_sigmask; /* LWP signal mask */ sigset_t pl_siglist; /* LWP pending signal */ struct __siginfo pl_siginfo; /* siginfo for signal */ diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 8551085..04e0ae9 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -255,6 +255,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, } pref = newb + fs->fs_frag; nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); @@ -309,7 +311,7 @@ retry: if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); - if (++reclaimed == 1) { + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); @@ -325,6 +327,8 @@ retry: } pref = newb + fs->fs_frag; nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); @@ -386,7 +390,7 @@ retry: flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); - if (++reclaimed == 1) { + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); @@ -401,6 +405,8 @@ retry: goto fail; } nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); @@ -478,10 +484,16 @@ fail: * We shall not leave the freed blocks on the vnode * buffer object lists. */ - bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT); + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { - bp->b_flags |= (B_INVAL | B_RELBUF); - bp->b_flags &= ~B_ASYNC; + KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), + ("mismatch1 l %jd %jd b %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, + (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp))); + bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; + bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; @@ -524,6 +536,18 @@ fail: * cleared, free the blocks. 
*/ for (blkp = allociblk; blkp < allocblk; blkp++) { +#ifdef INVARIANTS + if (blkp == allociblk) + lbns_remfree = lbns; + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + panic("zombie1 %jd %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp)); + } + lbns_remfree++; +#endif ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } @@ -818,6 +842,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, } pref = newb + fs->fs_frag; nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, @@ -873,7 +899,7 @@ retry: if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); - if (++reclaimed == 1) { + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); @@ -889,6 +915,8 @@ retry: } pref = newb + fs->fs_frag; nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, @@ -951,7 +979,7 @@ retry: flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); - if (++reclaimed == 1) { + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); @@ -966,6 +994,8 @@ retry: goto fail; } nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); @@ -1049,10 +1079,16 @@ fail: * We shall not leave the freed blocks on the vnode * buffer object lists. */ - bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT); + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { - bp->b_flags |= (B_INVAL | B_RELBUF); - bp->b_flags &= ~B_ASYNC; + KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), + ("mismatch2 l %jd %jd b %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, + (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp))); + bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; + bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; @@ -1095,6 +1131,18 @@ fail: * cleared, free the blocks. */ for (blkp = allociblk; blkp < allocblk; blkp++) { +#ifdef INVARIANTS + if (blkp == allociblk) + lbns_remfree = lbns; + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + panic("zombie2 %jd %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp)); + } + lbns_remfree++; +#endif ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } |
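The MPASS additions in the ffs_balloc_ufs1()/ffs_balloc_ufs2() hunks bound a pair of parallel on-stack cursors (allocblk over allociblk[], lbns_remfree over lbns[]) that record every block allocated so the failure path can unwind them in step, and the new INVARIANTS blocks panic if an unwound block still has a live ("zombie") buffer. A generic sketch of that parallel-cursor pattern (illustration only; names are hypothetical):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define NCURSOR 16	/* stand-in for the fixed on-stack array size */

struct alloc_cursor {
	int64_t blks[NCURSOR];	/* physical blocks handed out */
	int64_t lbns[NCURSOR];	/* matching logical block numbers */
	size_t n;
};

static void
cursor_record(struct alloc_cursor *c, int64_t blk, int64_t lbn)
{
	/* Mirrors the MPASS checks: never run past the arrays. */
	assert(c->n < NCURSOR);
	c->blks[c->n] = blk;
	c->lbns[c->n] = lbn;
	c->n++;
}

static void
cursor_unwind(struct alloc_cursor *c,
    void (*blkfree)(int64_t blk, int64_t lbn))
{
	/* Failure path: free everything recorded, walking both arrays. */
	for (size_t i = 0; i < c->n; i++)
		blkfree(c->blks[i], c->lbns[i]);
	c->n = 0;
}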