author     Renato Botelho <renato@netgate.com>    2016-08-25 10:41:37 -0300
committer  Renato Botelho <renato@netgate.com>    2016-08-25 10:41:37 -0300
commit     29ebd1247162a77db08e5e2e00d033220ec807fe (patch)
tree       d45bd4c2da327a132f18b6f39db36fe188c4e029 /sys
parent     75cd8d40056c799f03b759475d9bfd10ba266a6c (diff)
parent     c29dc2b4296960868edafe94ebf975be284200bb (diff)
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys')
-rw-r--r--  sys/boot/efi/loader/Makefile                                     |   12
-rw-r--r--  sys/cam/cam_ccb.h                                                |   12
-rw-r--r--  sys/cam/ctl/ctl.c                                                |    2
-rw-r--r--  sys/cam/ctl/scsi_ctl.c                                           |   23
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vnode.h                          |    2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h     |   16
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h  |    1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h   |    2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c         |   43
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c         |  449
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c          |   18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c      |   22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c       | 2563
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c       |   88
-rw-r--r--  sys/conf/files.amd64                                             |    5
-rw-r--r--  sys/conf/files.i386                                              |    5
-rw-r--r--  sys/dev/ahci/ahci.c                                              |    4
-rw-r--r--  sys/dev/ahci/ahci_pci.c                                          |    2
-rw-r--r--  sys/dev/e1000/e1000_api.c                                        |    4
-rw-r--r--  sys/dev/e1000/e1000_hw.h                                         |    8
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.c                                    |   17
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.h                                    |    2
-rw-r--r--  sys/dev/e1000/e1000_phy.c                                        |   16
-rw-r--r--  sys/dev/e1000/if_em.c                                            |    6
-rw-r--r--  sys/dev/filemon/filemon.c                                        |    2
-rw-r--r--  sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c                  |  155
-rw-r--r--  sys/dev/hyperv/storvsc/hv_vstorage.h                             |    6
-rw-r--r--  sys/dev/isp/isp.c                                                |    5
-rw-r--r--  sys/dev/isp/isp_freebsd.c                                        |   24
-rw-r--r--  sys/dev/ntb/if_ntb/if_ntb.c                                      | 1807
-rw-r--r--  sys/dev/ntb/ntb.c                                                |  463
-rw-r--r--  sys/dev/ntb/ntb.h                                                |  409
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.c                                      | 1357
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.h                                      |  125
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_regs.h                                    |    3
-rw-r--r--  sys/dev/ntb/ntb_if.m                                             |  210
-rw-r--r--  sys/dev/ntb/ntb_transport.c                                      | 1521
-rw-r--r--  sys/dev/ntb/ntb_transport.h                                      |   61
-rw-r--r--  sys/dev/pci/pci.c                                                |    4
-rw-r--r--  sys/dev/sfxge/sfxge_ev.c                                         |    2
-rw-r--r--  sys/dev/sfxge/sfxge_port.c                                       |    4
-rw-r--r--  sys/kern/kern_fork.c                                             |    3
-rw-r--r--  sys/kern/kern_sig.c                                              |    2
-rw-r--r--  sys/kern/subr_syscall.c                                          |    8
-rw-r--r--  sys/kern/sys_process.c                                           |   17
-rw-r--r--  sys/kern/vfs_aio.c                                               |    7
-rw-r--r--  sys/kern/vfs_default.c                                           |    1
-rw-r--r--  sys/modules/ntb/Makefile                                         |    2
-rw-r--r--  sys/modules/ntb/ntb/Makefile                                     |   11
-rw-r--r--  sys/modules/ntb/ntb_hw/Makefile                                  |    4
-rw-r--r--  sys/modules/ntb/ntb_transport/Makefile                           |   11
-rw-r--r--  sys/net/if_bridge.c                                              |    3
-rw-r--r--  sys/netpfil/pf/pf.c                                              |   12
-rw-r--r--  sys/sys/param.h                                                  |    2
-rw-r--r--  sys/sys/proc.h                                                   |    5
-rw-r--r--  sys/sys/ptrace.h                                                 |    3
-rw-r--r--  sys/ufs/ffs/ffs_balloc.c                                         |   68
57 files changed, 4698 insertions, 4941 deletions
diff --git a/sys/boot/efi/loader/Makefile b/sys/boot/efi/loader/Makefile
index bc38ea6..2c430ec 100644
--- a/sys/boot/efi/loader/Makefile
+++ b/sys/boot/efi/loader/Makefile
@@ -50,6 +50,18 @@ CFLAGS+= -DEFI_ZFS_BOOT
.endif
CFLAGS+= -DNO_PCI -DEFI
+.if !defined(BOOT_HIDE_SERIAL_NUMBERS)
+# Export serial numbers, UUID, and asset tag from loader.
+CFLAGS+= -DSMBIOS_SERIAL_NUMBERS
+.if defined(BOOT_LITTLE_ENDIAN_UUID)
+# Use little-endian UUID format as defined in SMBIOS 2.6.
+CFLAGS+= -DSMBIOS_LITTLE_ENDIAN_UUID
+.elif defined(BOOT_NETWORK_ENDIAN_UUID)
+# Use network-endian UUID format for backward compatibility.
+CFLAGS+= -DSMBIOS_NETWORK_ENDIAN_UUID
+.endif
+.endif
+
.if ${MK_FORTH} != "no"
BOOT_FORTH= yes
CFLAGS+= -DBOOT_FORTH
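
The three knobs above are plain make(1) variables tested with defined(), so
they can be set on the command line or in /etc/make.conf. A minimal usage
sketch (assuming a standard source tree at /usr/src; the invocation is
illustrative, only the knob names come from the hunk above):

    # Build the EFI loader exporting SMBIOS data with little-endian UUIDs.
    cd /usr/src/sys/boot/efi/loader && make BOOT_LITTLE_ENDIAN_UUID=yes

    # Or hide serial numbers, UUID, and asset tag entirely.
    cd /usr/src/sys/boot/efi/loader && make BOOT_HIDE_SERIAL_NUMBERS=yes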
diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h
index 251d62d..1d56ac7 100644
--- a/sys/cam/cam_ccb.h
+++ b/sys/cam/cam_ccb.h
@@ -1084,7 +1084,17 @@ struct ccb_notify_acknowledge {
u_int tag_id; /* Tag for immediate notify */
u_int seq_id; /* Tag for target of notify */
u_int initiator_id; /* Initiator Identifier */
- u_int arg; /* Function specific */
+ u_int arg; /* Response information */
+ /*
+ * Lower byte of arg is one of RESPONSE CODE values defined below
+ * (subset of response codes from SPL-4 and FCP-4 specifications),
+ * upper 3 bytes is code-specific ADDITIONAL RESPONSE INFORMATION.
+ */
+#define CAM_RSP_TMF_COMPLETE 0x00
+#define CAM_RSP_TMF_REJECTED 0x04
+#define CAM_RSP_TMF_FAILED 0x05
+#define CAM_RSP_TMF_SUCCEEDED 0x08
+#define CAM_RSP_TMF_INCORRECT_LUN 0x09
};
/* HBA engine structures. */
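
The new layout packs a CAM_RSP_TMF_* response code into the low byte of
'arg' and code-specific additional information into the upper three bytes,
mirroring the scsi_3btoul(...) << 8 construction in scsi_ctl.c below. A
standalone sketch of the encoding (hypothetical helpers, not part of the
patch):

    #include <stdint.h>

    /* Pack: low byte = response code, upper 3 bytes = additional info. */
    static inline uint32_t
    tmf_arg_pack(uint8_t code, uint32_t info)
    {
            return (((info & 0xffffffU) << 8) | code);
    }

    /* Unpack the response code and the additional information. */
    static inline uint8_t  tmf_arg_code(uint32_t arg) { return (arg & 0xff); }
    static inline uint32_t tmf_arg_info(uint32_t arg) { return (arg >> 8); }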
diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
index 5455eea..8ec048b 100644
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -1818,6 +1818,7 @@ ctl_init(void)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
softc->flags = 0;
+ TUNABLE_INT_FETCH("kern.cam.ctl.ha_mode", (int *)&softc->ha_mode);
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0,
"HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)");
@@ -1827,6 +1828,7 @@ ctl_init(void)
* figured out through the slot the controller is in. Although it
* is an active/active system, someone has to be in charge.
*/
+ TUNABLE_INT_FETCH("kern.cam.ctl.ha_id", &softc->ha_id);
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0,
"HA head ID (0 - no HA)");
diff --git a/sys/cam/ctl/scsi_ctl.c b/sys/cam/ctl/scsi_ctl.c
index 2705240..abdbdcd 100644
--- a/sys/cam/ctl/scsi_ctl.c
+++ b/sys/cam/ctl/scsi_ctl.c
@@ -1552,6 +1552,7 @@ ctlfedone(struct cam_periph *periph, union ccb *done_ccb)
/*
* Queue this back down to the SIM as an immediate notify.
*/
+ done_ccb->ccb_h.status = CAM_REQ_INPROG;
done_ccb->ccb_h.func_code = XPT_IMMEDIATE_NOTIFY;
xpt_action(done_ccb);
break;
@@ -2040,6 +2041,28 @@ ctlfe_done(union ctl_io *io)
*/
ccb->ccb_h.status = CAM_REQ_INPROG;
ccb->ccb_h.func_code = XPT_NOTIFY_ACKNOWLEDGE;
+ switch (io->taskio.task_status) {
+ case CTL_TASK_FUNCTION_COMPLETE:
+ ccb->cna2.arg = CAM_RSP_TMF_COMPLETE;
+ break;
+ case CTL_TASK_FUNCTION_SUCCEEDED:
+ ccb->cna2.arg = CAM_RSP_TMF_SUCCEEDED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_FUNCTION_REJECTED:
+ ccb->cna2.arg = CAM_RSP_TMF_REJECTED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_LUN_DOES_NOT_EXIST:
+ ccb->cna2.arg = CAM_RSP_TMF_INCORRECT_LUN;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ case CTL_TASK_FUNCTION_NOT_SUPPORTED:
+ ccb->cna2.arg = CAM_RSP_TMF_FAILED;
+ ccb->ccb_h.flags |= CAM_SEND_STATUS;
+ break;
+ }
+ ccb->cna2.arg |= scsi_3btoul(io->taskio.task_resp) << 8;
xpt_action(ccb);
} else if (io->io_hdr.flags & CTL_FLAG_STATUS_SENT) {
if (softc->flags & CTLFE_LUN_WILDCARD) {
diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h
index 4e5b1c9..019efdf 100644
--- a/sys/cddl/compat/opensolaris/sys/vnode.h
+++ b/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -87,8 +87,6 @@ vn_is_readonly(vnode_t *vp)
#define VN_RELE(v) vrele(v)
#define VN_URELE(v) vput(v)
-#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0)
-
#define vnevent_create(vp, ct) do { } while (0)
#define vnevent_link(vp, ct) do { } while (0)
#define vnevent_remove(vp, dvp, name, ct) do { } while (0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
index 349f8ef..22d8e60 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -48,18 +48,18 @@ extern "C" {
#define IS_ROOT_NODE 0x01 /* create a root node */
#define IS_XATTR 0x02 /* create an extended attribute node */
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
- int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
- pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
index 4120883..df5ce05 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -75,6 +75,7 @@ struct zfsvfs {
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
+ boolean_t z_use_namecache;/* make use of FreeBSD name cache */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index 3e72ec4..7649295 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -181,10 +181,12 @@ typedef struct znode {
struct zfsvfs *z_zfsvfs;
vnode_t *z_vnode;
uint64_t z_id; /* object ID for this znode */
+#ifdef illumos
kmutex_t z_lock; /* znode modification lock */
krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+#endif
kmutex_t z_range_lock; /* protects changes to z_range_avl */
avl_tree_t z_range_avl; /* avl tree of file range locks */
uint8_t z_unlinked; /* file has been unlinked */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index fd1d59b..2e94ccc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -1058,8 +1058,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
- boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
{
zfs_acl_t *aclp;
int aclsize;
@@ -1068,26 +1067,15 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
zfs_acl_phys_t znode_acl;
int version;
int error;
- boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
if (zp->z_acl_cached && !will_modify) {
*aclpp = zp->z_acl_cached;
return (0);
}
- /*
- * close race where znode could be upgrade while trying to
- * read the znode attributes.
- *
- * But this could only happen if the file isn't already an SA
- * znode
- */
- if (!zp->z_is_sa && !have_lock) {
- mutex_enter(&zp->z_lock);
- drop_lock = B_TRUE;
- }
version = zfs_znode_acl_version(zp);
if ((error = zfs_acl_znode_info(zp, &aclsize,
@@ -1133,8 +1121,6 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
if (!will_modify)
zp->z_acl_cached = aclp;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
return (error);
}
@@ -1161,10 +1147,10 @@ zfs_acl_chown_setattr(znode_t *zp)
int error;
zfs_acl_t *aclp;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
&zp->z_pflags, zp->z_uid, zp->z_gid);
return (error);
@@ -1445,18 +1431,17 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
int error = 0;
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
else
- error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+ error = zfs_acl_node_read(zp, aclp, B_TRUE);
if (error == 0) {
(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
zfs_acl_chmod(ZTOV(zp)->v_type, mode,
(zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
}
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1627,6 +1612,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
boolean_t need_chmod = B_TRUE;
boolean_t inherited = B_FALSE;
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1710,12 +1696,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
if (acl_ids->z_aclp == NULL) {
mutex_enter(&dzp->z_acl_lock);
- mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) &&
(dzp->z_pflags & ZFS_INHERIT_ACE) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
- &paclp, B_FALSE));
+ VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
inherited = B_TRUE;
@@ -1724,7 +1708,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
zfs_acl_alloc(zfs_acl_version_zp(dzp));
acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
- mutex_exit(&dzp->z_lock);
mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
@@ -1790,7 +1773,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1938,6 +1922,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
boolean_t fuid_dirtied;
uint64_t acl_obj;
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (mask == 0)
return (SET_ERROR(ENOSYS));
@@ -1962,7 +1947,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
}
top:
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
@@ -1994,7 +1978,6 @@ top:
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
if (error == ERESTART) {
@@ -2020,7 +2003,6 @@ top:
if (fuidp)
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -2124,7 +2106,8 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index cf42ff6..f8f695b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -58,96 +58,64 @@
#include <sys/extdirent.h>
/*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
* of names after deciding which is the appropriate lookup interface.
*/
static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
- boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ boolean_t exact, uint64_t *zoid)
{
int error;
if (zfsvfs->z_norm) {
- matchtype_t mt = MT_FIRST;
- boolean_t conflict = B_FALSE;
- size_t bufsz = 0;
- char *buf = NULL;
-
- if (rpnp) {
- buf = rpnp->pn_buf;
- bufsz = rpnp->pn_bufsize;
- }
- if (exact)
- mt = MT_EXACT;
+ matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
+
/*
* In the non-mixed case we only expect there would ever
* be one match, but we need to use the normalizing lookup.
*/
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, buf, bufsz, &conflict);
- if (!error && deflags)
- *deflags = conflict ? ED_CASE_CONFLICT : 0;
+ zoid, mt, NULL, 0, NULL);
} else {
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
}
*zoid = ZFS_DIRENT_OBJ(*zoid);
- if (error == ENOENT && update)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
return (error);
}
/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modifications of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
*
* Input arguments:
* dzp - znode for directory
* name - name of entry to lock
* flag - ZNEW: if the entry already exists, fail with EEXIST.
* ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
* ZXATTR: we want dzp's xattr directory
- * ZCILOOK: On a mixed sensitivity file system,
- * this lookup should be case-insensitive.
- * ZCIEXACT: On a purely case-insensitive file system,
- * this lookup should be case-sensitive.
- * ZRENAMING: we are locking for renaming, force narrow locks
- * ZHAVELOCK: Don't grab the z_name_lock for this call. The
- * current thread already holds it.
*
* Output arguments:
* zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- * direntflags - (case-insensitive lookup only)
- * flags if multiple case-sensitive matches exist in directory
- * realpnp - (case-insensitive lookup only)
- * actual name matched within the directory
*
* Return value: 0 on success or errno on failure.
*
* NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- * but return znode pointers to a single match.
*/
int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
{
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- boolean_t update;
boolean_t exact;
uint64_t zoid;
vnode_t *vp = NULL;
int error = 0;
- int cmpflags;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
*zpp = NULL;
- *dlpp = NULL;
/*
* Verify that we are not trying to lock '.', '..', or '.zfs'
@@ -161,280 +129,93 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
* Case sensitivity and normalization preferences are set when
* the file system is created. These are stored in the
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect what vnodes can be cached in the DNLC, how we
- * perform zap lookups, and the "width" of our dirlocks.
+ * affect how we perform zap lookups.
*
- * A normal dirlock locks a single name. Note that with
- * normalization a name can be composed multiple ways, but
- * when normalized, these names all compare equal. A wide
- * dirlock locks multiple names. We need these when the file
- * system is supporting mixed-mode access. It is sometimes
- * necessary to lock all case permutations of file name at
- * once so that simultaneous case-insensitive/case-sensitive
- * behaves as rationally as possible.
- */
-
- /*
* Decide if exact matches should be requested when performing
* a zap lookup on file systems supporting case-insensitive
* access.
- */
- exact =
- ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
*
- * Maybe can add TO-UPPERed version of name to dnlc in ci-only
- * case for performance improvement?
- */
- update = !zfsvfs->z_norm ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
-
- /*
- * ZRENAMING indicates we are in a situation where we should
- * take narrow locks regardless of the file system's
- * preferences for normalizing and case folding. This will
- * prevent us deadlocking trying to grab the same wide lock
- * twice if the two names happen to be case-insensitive
- * matches.
- */
- if (flag & ZRENAMING)
- cmpflags = 0;
- else
- cmpflags = zfsvfs->z_norm;
-
- /*
- * Wait until there are no locks on this name.
- *
- * Don't grab the the lock if it is already held. However, cannot
- * have both ZSHARED and ZHAVELOCK together.
- */
- ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
- if (!(flag & ZHAVELOCK))
- rw_enter(&dzp->z_name_lock, RW_READER);
-
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked && !(flag & ZXATTR)) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
- if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
- U8_UNICODE_LATEST, &error) == 0) || error != 0)
- break;
- }
- if (error != 0) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- if (dl == NULL) {
- size_t namesize;
-
- /*
- * Allocate a new dirlock and add it to the list.
- */
- namesize = strlen(name) + 1;
- dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize,
- KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = (char *)(dl + 1);
- bcopy(name, dl->dl_name, namesize);
- dl->dl_sharecnt = 0;
- dl->dl_namelock = 0;
- dl->dl_namesize = namesize;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- /*
- * If the z_name_lock was NOT held for this dirlock record it.
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
*/
- if (flag & ZHAVELOCK)
- dl->dl_namelock = 1;
+ exact = zfsvfs->z_case == ZFS_CASE_MIXED;
- if (flag & ZSHARED)
- dl->dl_sharecnt++;
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
if (flag & ZXATTR) {
error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
sizeof (zoid));
if (error == 0)
error = (zoid == 0 ? ENOENT : 0);
} else {
- if (update)
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = SET_ERROR(ENOENT);
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (SET_ERROR(EEXIST));
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, exact,
- update, direntflags, realpnp, &zoid);
- }
+ error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
}
if (error) {
if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
return (error);
}
} else {
if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EEXIST));
}
error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
+ if (error)
return (error);
- }
- if (!(flag & ZXATTR) && update)
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+ ASSERT(!(*zpp)->z_unlinked);
}
- *dlpp = dl;
-
return (0);
}
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
- mutex_enter(&dzp->z_lock);
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
- if (!dl->dl_namelock)
- rw_exit(&dzp->z_name_lock);
+ if (dzp->z_unlinked)
+ return (ENOENT);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl) + dl->dl_namesize);
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
}
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
- int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
{
- zfs_dirlock_t *dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
znode_t *zp;
int error = 0;
- uint64_t parent;
- int unlinked;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
-
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if ((error = sa_lookup(dzp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- return (error);
- if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred,
- NULL, NULL, NULL);
- return (error);
- }
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, zpp);
} else {
- int zf;
-
- zf = ZEXISTS | ZSHARED;
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
}
- rpnp = NULL;
}
-
- if ((flags & FIGNORECASE) && rpnp && !error)
- (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
return (error);
}
@@ -510,8 +291,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
if (error != 0)
continue;
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
+ vput(ZTOV(zp));
}
zap_cursor_fini(&zc);
}
@@ -535,7 +317,6 @@ zfs_purgedir(znode_t *dzp)
znode_t *xzp;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
int skipped = 0;
int error;
@@ -549,6 +330,7 @@ zfs_purgedir(znode_t *dzp)
continue;
}
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
ASSERT((ZTOV(xzp)->v_type == VREG) ||
(ZTOV(xzp)->v_type == VLNK));
@@ -563,20 +345,17 @@ zfs_purgedir(znode_t *dzp)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
skipped += 1;
continue;
}
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
if (error)
skipped += 1;
dmu_tx_commit(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
zap_cursor_fini(&zc);
if (error != ENOENT)
@@ -596,6 +375,7 @@ zfs_rmnode(znode_t *zp)
int error;
ASSERT(zp->z_links == 0);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
/*
* If this is an attribute directory, purge its contents.
@@ -634,7 +414,8 @@ zfs_rmnode(znode_t *zp)
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
+ ASSERT3S(error, ==, 0);
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
}
acl_obj = zfs_external_acl(zp);
@@ -668,12 +449,10 @@ zfs_rmnode(znode_t *zp)
if (xzp) {
ASSERT(error == 0);
- mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
xzp->z_links = 0; /* no more links to it */
VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&xzp->z_links, sizeof (xzp->z_links), tx));
- mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -686,7 +465,7 @@ zfs_rmnode(znode_t *zp)
dmu_tx_commit(tx);
out:
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
static uint64_t
@@ -700,12 +479,12 @@ zfs_dirent(znode_t *zp, uint64_t mode)
}
/*
- * Link zp into dl. Can only fail if zp has been unlinked.
+ * Link zp into dzp. Can only fail if zp has been unlinked.
*/
int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
@@ -715,18 +494,32 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
int count = 0;
int error;
- mutex_enter(&zp->z_lock);
-
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+ if (zp_is_dir) {
+ error = 0;
+ if (dzp->z_links >= LINK_MAX)
+ error = SET_ERROR(EMLINK);
+ return (error);
+ }
+#endif
if (!(flag & ZRENAMING)) {
if (zp->z_unlinked) { /* no new links to unlinked zp */
ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
return (SET_ERROR(ENOENT));
}
+#if 0
+ if (zp->z_links >= LINK_MAX) {
+ return (SET_ERROR(EMLINK));
+ }
+#endif
zp->z_links++;
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
&zp->z_links, sizeof (zp->z_links));
+ } else {
+ ASSERT(zp->z_unlinked == 0);
}
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
&dzp->z_id, sizeof (dzp->z_id));
@@ -740,11 +533,8 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ctime, B_TRUE);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
-
- mutex_exit(&zp->z_lock);
+ ASSERT0(error);
- mutex_enter(&dzp->z_lock);
dzp->z_size++;
dzp->z_links += zp_is_dir;
count = 0;
@@ -760,55 +550,48 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
8, 1, &value, tx);
- ASSERT(error == 0);
-
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+ VERIFY0(error);
return (0);
}
static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
int flag)
{
int error;
if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
+ dzp->z_id, name, MT_EXACT, tx);
else
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
+ dzp->z_id, name, MT_FIRST, tx);
} else {
error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
+ dzp->z_id, name, tx);
}
return (error);
}
/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
* and it's the caller's job to do it.
*/
int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
@@ -818,22 +601,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
int count = 0;
int error;
- dnlc_remove(ZTOV(dzp), dl->dl_name);
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (!(flag & ZRENAMING)) {
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (SET_ERROR(EBUSY));
-
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (SET_ERROR(EBUSY));
- }
-
- mutex_enter(&zp->z_lock);
if (zp_is_dir && !zfs_dirempty(zp)) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
#ifdef illumos
return (SET_ERROR(EEXIST));
#else
@@ -846,10 +619,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
* First try removing the name from the directory; if that
* fails, return the error.
*/
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
return (error);
}
@@ -876,16 +647,14 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &zp->z_links, sizeof (zp->z_links));
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
count = 0;
- ASSERT(error == 0);
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
+ ASSERT0(error);
} else {
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0)
return (error);
}
- mutex_enter(&dzp->z_lock);
dzp->z_size--; /* one dirent removed */
dzp->z_links -= zp_is_dir; /* ".." link from zp */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
@@ -900,8 +669,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -912,14 +680,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}
/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be consider a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
*/
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2);
}
int
@@ -1013,23 +779,20 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
znode_t *xzp;
- zfs_dirlock_t *dl;
vattr_t va;
int error;
top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
if (error)
return (error);
if (xzp != NULL) {
*xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
return (0);
}
if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
#ifdef illumos
return (SET_ERROR(ENOENT));
#else
@@ -1038,7 +801,6 @@ top:
}
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EROFS));
}
@@ -1058,7 +820,6 @@ top:
zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
if (error == ERESTART) {
/* NB: we already did dmu_tx_wait() if necessary */
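
Taken together, the zfs_dir.c changes drop the dirlock protocol in favor of
a single invariant: the caller holds the directory vnode lock. A minimal
sketch of the new calling convention (illustrative kernel-context fragment,
not from the patch; error handling abbreviated):

    znode_t *zp;
    int error;

    /* Caller must already hold the vnode lock on the directory. */
    ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
    error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
    if (error == 0) {
            /* use zp ...; zp carries a reference from zfs_zget(), so
             * drop it when done. */
            vrele(ZTOV(zp));
    }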
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
index 3a472aa..819eca2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
@@ -124,7 +124,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa) {
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -158,7 +158,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -205,7 +205,6 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
uint64_t crtime[2], mtime[2], ctime[2];
zfs_acl_phys_t znode_acl;
char scanstamp[AV_SCANSTAMP_SZ];
- boolean_t drop_lock = B_FALSE;
/*
* No upgrade if ACL isn't cached
@@ -217,20 +216,16 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
return;
/*
- * If the z_lock is held and we aren't the owner
- * the just return since we don't want to deadlock
+ * If the vnode lock is held and we aren't the owner
+ * then just return since we don't want to deadlock
* trying to update the status of z_is_sa. This
* file can then be upgraded at a later time.
*
* Otherwise, we know we are doing the
* sa_update() that caused us to enter this function.
*/
- if (mutex_owner(&zp->z_lock) != curthread) {
- if (mutex_tryenter(&zp->z_lock) == 0)
+ if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
return;
- else
- drop_lock = B_TRUE;
- }
/* First do a bulk query of the attributes that aren't cached */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
@@ -311,8 +306,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
zp->z_is_sa = B_TRUE;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
+ VOP_UNLOCK(ZTOV(zp), 0);
}
void
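
The rewritten zfs_sa_upgrade() avoids recursing on a lock its own caller may
hold by replacing the mutex-owner check with a non-blocking lock attempt.
The general pattern, restated as a sketch using the vnode lock API shown
above (the upgrade simply happens later if the trylock fails):

    /* Try to take the lock without sleeping; if anyone (possibly our own
     * caller) already holds it, skip the upgrade for now. */
    if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
            return;
    /* ... perform the SA upgrade ... */
    VOP_UNLOCK(ZTOV(zp), 0);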
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 8523bc4..aa711f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -957,6 +957,18 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
else if (error != 0)
return (error);
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
return (0);
}
@@ -997,7 +1009,11 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
+#ifdef DIAGNOSTIC
+ rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -2044,7 +2060,7 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
ZFS_ENTER(zfsvfs);
err = zfs_zget(zfsvfs, ino, &zp);
if (err == 0 && zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
err = EINVAL;
}
if (err == 0)
@@ -2145,7 +2161,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
0, NULL, NULL, NULL, NULL, NULL) == 0);
} else {
- VN_HOLD(*vpp);
+ vref(*vpp);
}
ZFS_EXIT(zfsvfs);
err = vn_lock(*vpp, flags);
@@ -2168,7 +2184,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
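
The VN_RELE()/VN_HOLD() substitutions above follow the Solaris-compat
macros: VN_RELE(v) is vrele(v) and VN_URELE(v) is vput(v) per the vnode.h
hunk earlier in this merge; mapping VN_HOLD(v) to vref(v) is an assumption
based on the same compat header:

    vref(vp);     /* VN_HOLD(vp)  - take a reference (assumed mapping) */
    vrele(vp);    /* VN_RELE(vp)  - drop a reference; vnode not locked */
    vput(vp);     /* VN_URELE(vp) - unlock and drop a reference */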
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 17179f6..e2fe974 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -66,7 +66,6 @@
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
-#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
@@ -147,7 +146,7 @@
*
* ZFS_ENTER(zfsvfs); // exit if unmounted
* top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * zfs_dirent_lookup(...) // look up directory entry (may VN_HOLD())
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
@@ -1433,26 +1432,81 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
return (error);
}
-/*
- * If vnode is for a device return a specfs vnode instead.
- */
static int
-specvp_check(vnode_t **vpp, cred_t *cr)
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
- int error = 0;
-
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
+ int error;
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = SET_ERROR(ENOSYS);
- *vpp = svp;
- }
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
return (error);
}
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+ ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relock for the "." case could leave us with
+ * reclaimed vnode.
+ */
+ if (dvp->v_iflag & VI_DOOMED) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires some rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
/*
* Lookup an entry in a directory, or an extended attribute directory.
@@ -1465,8 +1519,6 @@ specvp_check(vnode_t **vpp, cred_t *cr)
* rdir - root directory vnode [UNUSED].
* cr - credentials of caller.
* ct - caller context
- * direntflags - directory lookup flags
- * realpnp - returned pathname.
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
@@ -1481,46 +1533,17 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
int nameiop, cred_t *cr, kthread_t *td, int flags)
{
znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error = 0;
- int *direntflags = NULL;
- void *realpnp = NULL;
-
- /* fast path */
- if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+ /* fast path (should be redundant with vfs namecache) */
+ if (!(flags & LOOKUP_XATTR)) {
if (dvp->v_type != VDIR) {
return (SET_ERROR(ENOTDIR));
} else if (zdp->z_sa_hdl == NULL) {
return (SET_ERROR(EIO));
}
-
- if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (!error) {
- *vpp = dvp;
- VN_HOLD(*vpp);
- return (0);
- }
- return (error);
- } else {
- vnode_t *tvp = dnlc_lookup(dvp, nm);
-
- if (tvp) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (error) {
- VN_RELE(tvp);
- return (error);
- }
- if (tvp == DNLC_NO_VNODE) {
- VN_RELE(tvp);
- return (SET_ERROR(ENOENT));
- } else {
- *vpp = tvp;
- return (specvp_check(vpp, cr));
- }
- }
- }
}
DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
@@ -1558,10 +1581,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
/*
* Do we have permission to get into attribute directory?
*/
-
if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
B_FALSE, cr)) {
- VN_RELE(*vpp);
+ vrele(*vpp);
*vpp = NULL;
}
@@ -1569,15 +1591,9 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
return (error);
}
- if (dvp->v_type != VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTDIR));
- }
-
/*
* Check accessibility of directory.
*/
-
if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -1589,9 +1605,90 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
return (SET_ERROR(EILSEQ));
}
- error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
- if (error == 0)
- error = specvp_check(vpp, cr);
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred,
+ NULL, NULL, NULL);
+ ZFS_EXIT(zfsvfs);
+ if (error == 0) {
+ error = zfs_lookup_lock(dvp, *vpp, nm,
+ cnp->cn_lkflags);
+ }
+ goto out;
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ error = 0;
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ error = SET_ERROR(ENOTSUP);
+ else
+ *vpp = zfsctl_root(zdp);
+ ZFS_EXIT(zfsvfs);
+ if (error == 0)
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ goto out;
+ }
+
+ /*
+ * The loop retries the lookup if the parent-child relationship
+ * changes during the dot-dot locking complexities.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+out:
+ if (error != 0)
+ *vpp = NULL;
/* Translate errors and add SAVENAME when needed. */
if (cnp->cn_flags & ISLASTCN) {
@@ -1610,42 +1707,20 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
break;
}
}
- if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
- int ltype = 0;
- if (cnp->cn_flags & ISDOTDOT) {
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp, 0);
- }
- ZFS_EXIT(zfsvfs);
- error = vn_lock(*vpp, cnp->cn_lkflags);
- if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, ltype | LK_RETRY);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- return (error);
- }
- } else {
- ZFS_EXIT(zfsvfs);
- }
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
-#ifdef FREEBSD_NAMECACHE
- /*
- * Insert name into cache (as non-existent) if appropriate.
- */
- if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
- cache_enter(dvp, *vpp, cnp);
- /*
- * Insert name into cache if appropriate.
- */
- if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
if (!(cnp->cn_flags & ISLASTCN) ||
(nameiop != DELETE && nameiop != RENAME)) {
cache_enter(dvp, *vpp, cnp);
}
}
-#endif
return (error);
}
@@ -1683,7 +1758,6 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
objset_t *os;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
ksid_t *ksid;
@@ -1691,10 +1765,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
- boolean_t have_acl = B_FALSE;
- boolean_t waited = B_FALSE;
void *vsecp = NULL;
int flag = 0;
+ uint64_t txtype;
/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -1731,182 +1804,89 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
}
}
- getnewvnode_reserve(1);
-
-top:
*vpp = NULL;
if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
vap->va_mode &= ~S_ISVTX;
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- VN_HOLD(dvp);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible VN_HOLD(zp) */
- int zflg = 0;
-
- if (flag & FIGNORECASE)
- zflg |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL);
- if (error) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- if (strcmp(name, "..") == 0)
- error = SET_ERROR(EISDIR);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
}
+ ASSERT3P(zp, ==, NULL);
- if (zp == NULL) {
- uint64_t txtype;
-
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ goto out;
+ }
- if ((dzp->z_pflags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EINVAL);
- goto out;
- }
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
- if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
- have_acl = B_TRUE;
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
- tx = dmu_tx_create(os);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
+ getnewvnode_reserve(1);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- getnewvnode_drop_reserve();
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ tx = dmu_tx_create(os);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
- if (flag & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, acl_ids.z_fuidp, vap);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
- } else {
- int aflags = (flag & FAPPEND) ? V_APPEND : 0;
-
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- have_acl = B_FALSE;
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- /*
- * A directory entry already exists for this name.
- */
- /*
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl == EXCL) {
- error = SET_ERROR(EEXIST);
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
- error = SET_ERROR(EISDIR);
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
- goto out;
- }
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
- /*
- * Truncate regular files if requested.
- */
- if ((ZTOV(zp)->v_type == VREG) &&
- (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
- /* we can't hold any locks when calling zfs_freesp() */
- zfs_dirent_unlock(dl);
- dl = NULL;
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == 0) {
- vnevent_create(ZTOV(zp), ct);
- }
- }
- }
-out:
getnewvnode_drop_reserve();
- if (dl)
- zfs_dirent_unlock(dl);
- if (error) {
- if (zp)
- VN_RELE(ZTOV(zp));
- } else {
+out:
+ if (error == 0) {
*vpp = ZTOV(zp);
- error = specvp_check(vpp, cr);
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
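[Annotation] The recurring change in this hunk and nearly every one below: the opt-in retry loop around dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT) — drop the dirent lock on ERESTART, dmu_tx_wait(), dmu_tx_abort(), goto top — collapses into a single blocking dmu_tx_assign(tx, TXG_WAIT). The retry dance existed, presumably, because the caller used to hold a zfs_dirlock_t across the assignment; with those locks gone it can simply block. A minimal userland model of the two shapes, with txg_try_assign() and txg_wait_open() as hypothetical stand-ins for the real DMU interfaces:

	#include <errno.h>

	int txg_try_assign(void);	/* returns 0, or ERESTART when the open txg is full */
	void txg_wait_open(void);	/* blocks until a new txg opens */

	/* Old shape: drop caller-held locks, wait, restart from "top:". */
	int
	assign_nowait_retry(void)
	{
		int error;

		while ((error = txg_try_assign()) == ERESTART) {
			/* unlock, wait, then re-lookup everything */
			txg_wait_open();
		}
		return (error);
	}

	/* New shape: nothing is held across the call, so just block. */
	int
	assign_wait(void)
	{
		int error;

		while ((error = txg_try_assign()) == ERESTART)
			txg_wait_open();	/* what TXG_WAIT does internally */
		return (error);
	}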
@@ -1932,57 +1912,30 @@ out:
* vp - ctime (if nlink > 0)
*/
-uint64_t null_xattr = 0;
-
/*ARGSUSED*/
static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
- int flags)
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
- znode_t *zp, *dzp = VTOZ(dvp);
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
znode_t *xzp;
- vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
- uint64_t xattr_obj_unlinked = 0;
uint64_t obj = 0;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
boolean_t unlinked, toobig = FALSE;
uint64_t txtype;
- pathname_t *realnmp = NULL;
- pathname_t realnm;
int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
+ zp = VTOZ(vp);
- if (flags & FIGNORECASE) {
- zflg |= ZCILOOK;
- pn_alloc(&realnm);
- realnmp = &realnm;
- }
-
-top:
xattr_obj = 0;
xzp = NULL;
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, realnmp)) {
- if (realnmp)
- pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
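[Annotation] The new zfs_remove() signature above is the heart of the rework: instead of a name that the function must itself look up and lock (zfs_dirent_lock with ZEXISTS, FIGNORECASE handling, a realnm pathname buffer), the FreeBSD VOP layer now passes in the vnode it already resolved and locked during lookup. The glue on the VOP side then plausibly reduces to a thin forwarder of this shape (a sketch; the actual zfs_freebsd_remove() wrapper is outside this hunk):

	static int
	zfs_freebsd_remove(struct vop_remove_args *ap)
	{

		/* a_dvp and a_vp arrive locked and named by the VFS lookup */
		return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
		    ap->a_cnp->cn_cred));
	}

Passing vnodes also removes a class of lookup/remove races: the entry the caller saw is, by construction, the entry being removed.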
@@ -1998,14 +1951,15 @@ top:
vnevent_remove(vp, dvp, name, ct);
- if (realnmp)
- dnlc_remove(dvp, realnmp->pn_buf);
- else
- dnlc_remove(dvp, name);
+ obj = zp->z_id;
- VI_LOCK(vp);
- may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
- VI_UNLOCK(vp);
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
/*
* We may delete the znode now, or we may put it in the unlinked set;
@@ -2013,35 +1967,17 @@ top:
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
- obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
- if (may_delete_now) {
- toobig =
- zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
- /* if the file is too big, only hold_free a token amount */
- dmu_tx_hold_free(tx, zp->z_id, 0,
- (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
- }
- /* are there any extended attributes? */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT0(error);
+ if (xzp) {
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- mutex_enter(&zp->z_lock);
- if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- mutex_exit(&zp->z_lock);
-
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -2050,20 +1986,8 @@ top:
*/
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (xzp)
- VN_RELE(ZTOV(xzp));
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- if (realnmp)
- pn_free(realnmp);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
@@ -2072,7 +1996,7 @@ top:
/*
* Remove the directory entry.
*/
- error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
if (error) {
dmu_tx_commit(tx);
@@ -2080,76 +2004,18 @@ top:
}
if (unlinked) {
- /*
- * Hold z_lock so that we can make sure that the ACL obj
- * hasn't changed. Could have been deleted due to
- * zfs_sa_upgrade().
- */
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
- delete_now = may_delete_now && !toobig &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
- acl_obj;
- VI_UNLOCK(vp);
- }
-
- if (delete_now) {
-#ifdef __FreeBSD__
- panic("zfs_remove: delete_now branch taken");
-#endif
- if (xattr_obj_unlinked) {
- ASSERT3U(xzp->z_links, ==, 2);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_links = 0;
- error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &xzp->z_links, sizeof (xzp->z_links), tx);
- ASSERT3U(error, ==, 0);
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
-
- if (zp->z_is_sa)
- error = sa_remove(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), tx);
- else
- error = sa_update(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), &null_xattr,
- sizeof (uint64_t), tx);
- ASSERT0(error);
- }
- VI_LOCK(vp);
- vp->v_count--;
- ASSERT0(vp->v_count);
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- } else if (unlinked) {
- mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
-#ifdef __FreeBSD__
vp->v_vflag |= VV_NOSYNC;
-#endif
}
txtype = TX_REMOVE;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
- if (realnmp)
- pn_free(realnmp);
-
- zfs_dirent_unlock(dl);
- if (!delete_now)
- VN_RELE(vp);
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
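[Annotation] With the delete_now branch gone (on FreeBSD it was believed unreachable — note the panic() being deleted above), every file whose last link is removed while the vnode still has holds takes one path: it is parked on the unlinked set and destroyed later, when the final reference is released. A small userland model of that deferral, all names illustrative:

	#include <stdbool.h>

	struct obj {
		int	refs;		/* v_count analogue */
		int	links;		/* z_links analogue */
		bool	unlinked;	/* parked on the unlinked set */
	};

	void obj_destroy(struct obj *o);	/* frees the storage */

	void
	obj_unlink(struct obj *o)
	{
		if (--o->links == 0)
			o->unlinked = true;	/* zfs_unlinked_add() analogue */
	}

	void
	obj_rele(struct obj *o)
	{
		if (--o->refs == 0 && o->unlinked)
			obj_destroy(o);		/* runs from inactive/reclaim */
	}

The VV_NOSYNC flag set just above serves the same end: data of an already-unlinked file no longer needs to reach disk.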
@@ -2180,23 +2046,19 @@ out:
*/
/*ARGSUSED*/
static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
- caller_context_t *ct, int flags, vsecattr_t *vsecp)
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
uint64_t txtype;
dmu_tx_t *tx;
int error;
- int zf = ZNEW;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
- boolean_t waited = B_FALSE;
ASSERT(vap->va_type == VDIR);
@@ -2211,7 +2073,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) ||
+ ((vap->va_mask & AT_XVATTR) ||
IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
@@ -2229,8 +2091,6 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
@@ -2241,13 +2101,11 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
}
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- vsecp, &acl_ids)) != 0) {
+ NULL, &acl_ids)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
- getnewvnode_reserve(1);
-
/*
* First make sure the new directory doesn't exist.
*
@@ -2255,29 +2113,23 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
* EACCES instead of EEXIST which can cause some applications
* to fail.
*/
-top:
*vpp = NULL;
- if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
- NULL, NULL)) {
+ if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
zfs_acl_ids_free(&acl_ids);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
+ ASSERT3P(zp, ==, NULL);
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
@@ -2285,6 +2137,7 @@ top:
/*
* Add a new entry to the directory.
*/
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
@@ -2299,15 +2152,8 @@ top:
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
@@ -2326,14 +2172,12 @@ top:
/*
* Now put new name in parent dir.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
*vpp = ZTOV(zp);
- txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
@@ -2342,8 +2186,6 @@ top:
getnewvnode_drop_reserve();
- zfs_dirent_unlock(dl);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
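[Annotation] zfs_mkdir() now uses the new zfs_dirent_lookup() where it used to take a zfs_dirlock_t. The contract visible in this diff — note the ASSERT3P(zp, ==, NULL) after a ZNEW lookup — is that ZNEW succeeds only when the name is absent, ZEXISTS only when it is present, and the exclusively locked directory vnode stands in for the old per-entry lock. A standalone sketch of that contract (dir_find() is a hypothetical helper):

	#include <errno.h>
	#include <stddef.h>

	#define	ZNEW	0x01	/* fail with EEXIST if the entry exists */
	#define	ZEXISTS	0x02	/* fail with ENOENT if the entry is absent */

	struct dir;
	struct node;
	struct node *dir_find(struct dir *dp, const char *name);

	int
	dirent_lookup(struct dir *dp, const char *name, struct node **npp,
	    int flags)
	{
		struct node *np = dir_find(dp, name);

		*npp = NULL;
		if (np == NULL)
			return ((flags & ZEXISTS) ? ENOENT : 0);
		if (flags & ZNEW)
			return (EEXIST);
		*npp = np;	/* the caller receives a held node */
		return (0);
	}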
@@ -2370,39 +2212,20 @@ top:
*/
/*ARGSUSED*/
static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
znode_t *dzp = VTOZ(dvp);
- znode_t *zp;
- vnode_t *vp;
+ znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-top:
- zp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
@@ -2413,25 +2236,8 @@ top:
goto out;
}
- if (vp == cwd) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
vnevent_rmdir(vp, dvp, name, ct);
- /*
-	 * Grab a lock on the directory to make sure that no one is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -2439,48 +2245,26 @@ top:
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
-#ifdef FREEBSD_NAMECACHE
cache_purge(dvp);
-#endif
- error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
if (error == 0) {
uint64_t txtype = TX_RMDIR;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
cache_purge(vp);
-#endif
out:
- zfs_dirent_unlock(dl);
-
- VN_RELE(vp);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2705,10 +2489,10 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
goto skip_entry;
if (!zfs_has_access(ezp, cr)) {
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
goto skip_entry;
}
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
}
if (flags & V_RDDIR_ENTFLAGS)
@@ -2905,7 +2689,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
* than to determine whether we were asked the question.
*/
- mutex_enter(&zp->z_lock);
vap->va_type = IFTOVT(zp->z_mode);
vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
@@ -3042,7 +2825,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_TIME_DECODE(&vap->va_ctime, ctime);
ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
- mutex_exit(&zp->z_lock);
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
@@ -3178,7 +2960,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
}
}
-top:
attrzp = NULL;
aclp = NULL;
@@ -3267,7 +3048,6 @@ top:
}
}
- mutex_enter(&zp->z_lock);
oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
@@ -3341,7 +3121,6 @@ top:
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- mutex_exit(&zp->z_lock);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
@@ -3353,8 +3132,6 @@ top:
}
}
- mutex_exit(&zp->z_lock);
-
if (mask & AT_MODE) {
if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
err = secpolicy_setid_setsticky_clear(vp, vap,
@@ -3429,7 +3206,7 @@ top:
if (new_uid != zp->z_uid &&
zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3441,7 +3218,7 @@ top:
if (new_gid != zp->z_gid &&
zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3463,7 +3240,6 @@ top:
if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
goto out;
- mutex_enter(&zp->z_lock);
if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
/*
* Are we upgrading ACL from old V0 format
@@ -3484,7 +3260,6 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
}
- mutex_exit(&zp->z_lock);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
} else {
if ((mask & AT_XVATTR) &&
@@ -3517,10 +3292,8 @@ top:
* updated as a side-effect of calling this function.
*/
-
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, sizeof (zp->z_pflags));
@@ -3528,7 +3301,6 @@ top:
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&attrzp->z_acl_lock);
- mutex_enter(&attrzp->z_lock);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
sizeof (attrzp->z_pflags));
@@ -3662,14 +3434,12 @@ top:
if (mask != 0)
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
- mutex_exit(&zp->z_lock);
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&zp->z_acl_lock);
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&attrzp->z_acl_lock);
- mutex_exit(&attrzp->z_lock);
}
out:
if (err == 0 && attrzp) {
@@ -3679,7 +3449,7 @@ out:
}
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vrele(ZTOV(attrzp));
if (aclp)
zfs_acl_free(aclp);
@@ -3691,8 +3461,6 @@ out:
if (err) {
dmu_tx_abort(tx);
- if (err == ERESTART)
- goto top;
} else {
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
@@ -3706,101 +3474,236 @@ out2:
return (err);
}
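[Annotation] This setattr hunk, like the getattr, symlink, readlink and inactive hunks below, deletes nearly every mutex_enter(&zp->z_lock)/mutex_exit() pair. The apparent invariant behind the removals is that the FreeBSD vnode lock, already held by the VOP caller across each of these operations, now serializes znode attribute access, making the private mutex redundant. Schematically, with pthread stand-ins rather than kernel locks:

	#include <pthread.h>

	struct node {
		pthread_mutex_t	vlock;	/* vnode lock, taken by the VOP caller */
		pthread_mutex_t	zlock;	/* z_lock analogue, now deleted */
		long		attr;
	};

	/* Before: a second lock taken inside the operation itself. */
	long
	get_attr_old(struct node *n)
	{
		long v;

		pthread_mutex_lock(&n->zlock);
		v = n->attr;
		pthread_mutex_unlock(&n->zlock);
		return (v);
	}

	/* After: the caller already holds n->vlock, so a plain access is safe. */
	long
	get_attr_new(struct node *n)
	{
		return (n->attr);
	}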
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
+ * We acquire all but fdvp locks using non-blocking acquisitions. If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename. This acquire/release step ensures that we do not
+ * spin on a lock waiting for release. On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
*/
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
{
- zfs_zlock_t *zl;
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK(tdvp, 0);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK(*tvpp, 0);
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- VN_RELE(ZTOV(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next;
- kmem_free(zl, sizeof (*zl));
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK(tdvp, 0);
+ goto relock;
}
-}
+ tdzp = VTOZ(tdvp);
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t oidp = zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
/*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
*/
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+ * Re-resolve tvp, if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+	 * Now try to acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK(nvp, 0);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(*svpp, 0);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
}
+ vput(nvp);
+ goto relock;
}
+ *tvpp = nvp;
+ }
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
+ return (0);
- if (oidp == szp->z_id) /* We're a descendant of szp */
- return (SET_ERROR(EINVAL));
+out:
+ return (error);
+}
- if (oidp == rootid) /* We've hit the top */
- return (0);
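[Annotation] zfs_rename_relock() above is the classic try-lock back-off spelled out in its opening comment: take the first vnode lock blocking, try the rest with LK_NOWAIT, and on EBUSY drop everything, acquire the contended lock blocking once purely to wait out its holder, release it, and restart from relock:. A runnable pthread model of the core step:

	#include <pthread.h>

	/* Lock a then b without deadlocking against a thread locking b then a. */
	void
	lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		for (;;) {
			pthread_mutex_lock(a);			/* like sdvp */
			if (pthread_mutex_trylock(b) == 0)	/* like LK_NOWAIT */
				return;				/* both held */
			pthread_mutex_unlock(a);
			/* Wait for b's holder without spinning, then retry. */
			pthread_mutex_lock(b);
			pthread_mutex_unlock(b);
		}
	}

The same trade-off as in the kernel code applies: the loop can in principle restart many times, but each pass waits for a real lock release, so it never busy-spins.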
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
}
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
- &oidp, sizeof (oidp));
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
- } while (zp->z_id != sdzp->z_id);
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
- return (0);
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
}
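[Annotation] zfs_rename_check() replaces the deleted zfs_rename_lock() chain of z_parent_lock acquisitions with a lock-free walk of the stored parent object numbers: start at the target directory, follow SA_ZPL_PARENT upward, and fail with EINVAL if the source shows up as an ancestor (which would move a directory beneath itself). A standalone model of the walk, with parent_of() standing in for the sa_lookup() of SA_ZPL_PARENT:

	#include <errno.h>
	#include <stdint.h>

	int parent_of(uint64_t id, uint64_t *pidp);	/* sa_lookup() stand-in */

	int
	rename_check(uint64_t src, uint64_t sdir, uint64_t tdir, uint64_t root)
	{
		uint64_t id, pid;
		int error;

		if (tdir == src)
			return (EINVAL);	/* rename a dir into itself */
		for (id = tdir; id != root && id != sdir; id = pid) {
			if ((error = parent_of(id, &pid)) != 0)
				return (error);
			if (pid == src)
				return (EINVAL);	/* src is an ancestor */
		}
		return (0);
	}

The kernel version additionally drops each visited znode through VN_RELE_ASYNC, since a synchronous vrele() on the walk could need a vnode lock the walker must not take, as its comment explains.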
/*
@@ -3822,187 +3725,93 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
*/
/*ARGSUSED*/
static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr)
{
- znode_t *tdzp, *sdzp, *szp, *tzp;
- zfsvfs_t *zfsvfs;
- zilog_t *zilog;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr;
+ char *snm = scnp->cn_nameptr;
+ char *tnm = tcnp->cn_nameptr;
int error = 0;
- int zflg = 0;
- boolean_t waited = B_FALSE;
- tdzp = VTOZ(tdvp);
- ZFS_VERIFY_ZP(tdzp);
- zfsvfs = tdzp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- zilog = zfsvfs->z_log;
- sdzp = VTOZ(sdvp);
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
/*
- * In case sdzp is not valid, let's be sure to exit from the right
- * zfsvfs_t.
+ * Lock all four vnodes to ensure safety and semantics of renaming.
*/
- if (sdzp->z_sa_hdl == NULL) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EIO));
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
}
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
/*
- * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
- * ctldir appear to have the same v_vfsp.
+ * After we re-enter ZFS_ENTER() we will have to revalidate all
+ * znodes involved.
*/
- if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
+ ZFS_ENTER(zfsvfs);
if (zfsvfs->z_utf8 && u8_validate(tnm,
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
}
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- /*
- * First compare the two name arguments without
- * considering any case folding.
- */
- int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
- cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
- ASSERT(error == 0 || !zfsvfs->z_utf8);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- /*
- * If the file system is case-folding, then we may
- * have some more checking to do. A case-folding file
- * system is either supporting mixed case sensitivity
- * access or is completely case-insensitive. Note
- * that the file system is always case preserving.
- *
- * In mixed sensitivity mode case sensitive behavior
- * is the default. FIGNORECASE must be used to
- * explicitly request case insensitive behavior.
- *
- * If the source and target names provided differ only
- * by case (e.g., a request to rename 'tim' to 'Tim'),
- * we will treat this as a special case in the
- * case-insensitive mode: as long as the source name
- * is an exact match, we will allow this to proceed as
- * a name-change request.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- flags & FIGNORECASE)) &&
- u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
- &error) == 0) {
- /*
- * case preserving rename request, require exact
- * name matches
- */
- zflg |= ZCIEXACT;
- zflg &= ~ZCILOOK;
- }
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
}
/*
- * If the source and destination directories are the same, we should
- * grab the z_name_lock of that directory only once.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
*/
- if (sdzp == tdzp) {
- zflg |= ZHAVELOCK;
- rw_enter(&sdzp->z_name_lock, RW_READER);
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
- ZEXISTS | zflg, NULL, NULL);
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
- } else {
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, zflg, NULL, NULL);
- serr = zfs_dirent_lock(&sdl,
- sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
- NULL, NULL);
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- /*
- * FreeBSD: In OpenSolaris they only check if rename source is
- * ".." here, because "." is handled in their lookup. This is
- * not the case for FreeBSD, so we check for "." explicitly.
- */
- if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
- serr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl);
- VN_RELE(ZTOV(szp));
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (strcmp(tnm, "..") == 0)
- terr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (terr);
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
}
/*
@@ -4011,17 +3820,26 @@ top:
* Note that if target and source are the same, this can be
* done in a single check.
*/
-
if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
- if (ZTOV(szp)->v_type == VDIR) {
/*
* Check to make sure rename is valid.
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
*/
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
+ if (error = zfs_rename_check(szp, sdzp, tdzp))
+ goto unlockout;
}
/*
@@ -4031,31 +3849,26 @@ top:
/*
* Source and target must be the same type.
*/
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
error = SET_ERROR(ENOTDIR);
- goto out;
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
}
} else {
- if (ZTOV(tzp)->v_type == VDIR) {
+ if ((*tvpp)->v_type == VDIR) {
error = SET_ERROR(EISDIR);
- goto out;
+ goto unlockout;
}
}
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
}
- vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
if (tzp)
- vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
/*
* notify the target directory if it is not the same
@@ -4081,35 +3894,18 @@ top:
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
+ goto unlockout;
}
+
if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
if (error == 0) {
szp->z_pflags |= ZFS_AV_MODIFIED;
@@ -4117,17 +3913,16 @@ top:
(void *)&szp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0), sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
/*
* Update path information for the target vnode
*/
- vn_renamepath(tdvp, ZTOV(szp), tnm,
- strlen(tnm));
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
} else {
/*
* At this point, we have successfully created
@@ -4141,42 +3936,33 @@ top:
* succeed; fortunately, it is very unlikely to
* fail, since we just created it.
*/
- VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
ZRENAMING, NULL), ==, 0);
}
}
-#ifdef FREEBSD_NAMECACHE
if (error == 0) {
- cache_purge(sdvp);
- cache_purge(tdvp);
- cache_purge(ZTOV(szp));
- if (tzp)
- cache_purge(ZTOV(tzp));
+ cache_purge(*svpp);
+ if (*tvpp != NULL)
+ cache_purge(*tvpp);
+ cache_purge_negative(tdvp);
}
-#endif
}
dmu_tx_commit(tx);
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(*svpp, 0);
+ VOP_UNLOCK(sdvp, 0);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+out: /* original two vnodes are locked */
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
-
+ if (*tvpp != NULL)
+ VOP_UNLOCK(*tvpp, 0);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK(tdvp, 0);
return (error);
}
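[Annotation] The rewritten zfs_rename() trades the old single out: label for a two-tier unwind, as the comments on the labels say: unlockout is reached with all four vnodes locked and ZFS_ENTER in effect, out with only the caller's locked vnodes left to release. The shape, reduced to a sketch with illustrative lock type and helpers:

	struct lk;
	void drop(struct lk *);
	int take_extra(struct lk **cp, struct lk **dp);	/* zfs_rename_relock() role */
	int do_work(struct lk *c, struct lk *d);

	int
	op(struct lk *a, struct lk *b)	/* a and b locked by the caller */
	{
		struct lk *c = NULL, *d = NULL;
		int error;

		if ((error = take_extra(&c, &d)) != 0)
			goto out;	/* nothing extra is held yet */
		error = do_work(c, d);
		/* unlockout tier: success and failure unwind the same way */
		drop(c);
		drop(d);
	out:
		/* out tier: only what the caller handed in */
		drop(a);
		drop(b);
		return (error);
	}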
@@ -4201,17 +3987,14 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
cred_t *cr, kthread_t *td)
{
znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t len = strlen(link);
int error;
- int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
uint64_t txtype = TX_SYMLINK;
- boolean_t waited = B_FALSE;
int flags = 0;
ASSERT(vap->va_type == VLNK);
@@ -4225,8 +4008,6 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
@@ -4239,35 +4020,29 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
return (error);
}
- getnewvnode_reserve(1);
-
-top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
if (error) {
zfs_acl_ids_free(&acl_ids);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
+
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
@@ -4281,15 +4056,8 @@ top:
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
getnewvnode_drop_reserve();
@@ -4306,13 +4074,11 @@ top:
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
link, len, tx);
else
zfs_sa_symlink(zp, link, len, tx);
- mutex_exit(&zp->z_lock);
zp->z_size = len;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -4320,10 +4086,8 @@ top:
/*
* Insert the new object into the directory.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
*vpp = ZTOV(zp);
@@ -4333,8 +4097,6 @@ top:
getnewvnode_drop_reserve();
- zfs_dirent_unlock(dl);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
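[Annotation] zfs_symlink() gets the same reordering applied to zfs_create() and zfs_mkdir() earlier in the diff: getnewvnode_reserve() moves from before the name lookup to after the existence, access and quota checks, so the cheap failure paths no longer need a matching getnewvnode_drop_reserve(). In outline, with hypothetical helper names:

	#include <errno.h>

	struct dir;
	int check_exists(struct dir *, const char *);	/* EEXIST path */
	int check_access(struct dir *);			/* EACCES path */
	int check_quota(struct dir *);			/* EDQUOT path */
	void reserve(void);		/* getnewvnode_reserve() analogue */
	void unreserve(void);		/* getnewvnode_drop_reserve() analogue */
	int build_and_link(struct dir *, const char *);

	int
	create_obj(struct dir *dp, const char *name)
	{
		int error;

		/* Fail-fast checks first: nothing to unwind on error. */
		if ((error = check_exists(dp, name)) != 0 ||
		    (error = check_access(dp)) != 0 ||
		    (error = check_quota(dp)) != 0)
			return (error);

		reserve();		/* a vnode really will be created */
		error = build_and_link(dp, name);
		unreserve();
		return (error);
	}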
@@ -4369,13 +4131,11 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
SA_ZPL_SYMLINK(zfsvfs), uio);
else
error = zfs_sa_readlink(zp, uio);
- mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -4407,14 +4167,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
znode_t *tzp, *szp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- vnode_t *realvp;
int error;
- int zf = ZNEW;
uint64_t parent;
uid_t owner;
- boolean_t waited = B_FALSE;
ASSERT(tdvp->v_type == VDIR);
@@ -4422,9 +4178,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (VOP_REALVP(svp, &realvp, ct) == 0)
- svp = realvp;
-
/*
* POSIX dictates that we return EPERM here.
* Better choices include ENOTSUP or EISDIR.
@@ -4442,15 +4195,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
return (SET_ERROR(EPERM));
}
- /*
- * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
- * ctldir appear to have the same v_vfsp.
- */
- if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
-
/* Prevent links to .zfs/shares files */
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
@@ -4468,8 +4212,6 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
/*
* We do not support links between attributes and non-attributes
@@ -4494,11 +4236,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
return (error);
}
-top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -4509,33 +4250,22 @@ top:
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- error = zfs_link_create(dl, szp, tx, 0);
+ error = zfs_link_create(dzp, name, szp, tx, 0);
if (error == 0) {
uint64_t txtype = TX_LINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_link(zilog, tx, txtype, dzp, szp, name);
}
dmu_tx_commit(tx);
- zfs_dirent_unlock(dl);
-
if (error == 0) {
vnevent_link(svp, ct);
}
@@ -4547,235 +4277,6 @@ top:
return (error);
}
-#ifdef illumos
-/*
- * zfs_null_putapage() is used when the file system has been force
- * unmounted. It just drops the pages.
- */
-/* ARGSUSED */
-static int
-zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
- return (0);
-}
-
-/*
- * Push a page out to disk, klustering if possible.
- *
- * IN: vp - file to push page to.
- * pp - page to push.
- * flags - additional flags.
- * cr - credentials of caller.
- *
- * OUT: offp - start of range pushed.
- * lenp - len of range pushed.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * NOTE: callers must have locked the page to be pushed. On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_tx_t *tx;
- u_offset_t off, koff;
- size_t len, klen;
- int err;
-
- off = pp->p_offset;
- len = PAGESIZE;
- /*
- * If our blocksize is bigger than the page size, try to kluster
- * multiple pages so that we write a full block (thus avoiding
- * a read-modify-write).
- */
- if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
- ASSERT(koff <= zp->z_size);
- if (koff + klen > zp->z_size)
- klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
- pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
- }
- ASSERT3U(btop(len), ==, btopr(len));
-
- /*
- * Can't push pages past end-of-file.
- */
- if (off >= zp->z_size) {
- /* ignore all pages */
- err = 0;
- goto out;
- } else if (off + len > zp->z_size) {
- int npages = btopr(zp->z_size - off);
- page_t *trunc;
-
- page_list_break(&pp, &trunc, npages);
- /* ignore pages past end of file */
- if (trunc)
- pvn_write_done(trunc, flags);
- len = zp->z_size - off;
- }
-
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
- err = SET_ERROR(EDQUOT);
- goto out;
- }
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, zp->z_id, off, len);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- goto out;
- }
-
- if (zp->z_blksz <= PAGESIZE) {
- caddr_t va = zfs_map_page(pp, S_READ);
- ASSERT3U(len, <=, PAGESIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
- zfs_unmap_page(pp, va);
- } else {
- err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
- }
-
- if (err == 0) {
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int count = 0;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
- }
- dmu_tx_commit(tx);
-
-out:
- pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
- if (offp)
- *offp = off;
- if (lenp)
- *lenp = len;
-
- return (err);
-}
-
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the files vnode.
- *
- * IN: vp - vnode of file to push page data to.
- * off - position in file to put data.
- * len - amount of data to write.
- * flags - flags to control the operation.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
-/*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp;
- size_t io_len;
- u_offset_t io_off;
- uint_t blksz;
- rl_t *rl;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * Align this request to the file block size in case we kluster.
-	 * XXX - this can result in pretty aggressive locking, which can
-	 * impact simultaneous read/write access. One option might be
- * to break up long requests (len == 0) into block-by-block
- * operations to get narrower locking.
- */
- blksz = zp->z_blksz;
- if (ISP2(blksz))
- io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
- else
- io_off = 0;
- if (len > 0 && ISP2(blksz))
- io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
- else
- io_len = 0;
-
- if (io_len == 0) {
- /*
- * Search the entire vp list for pages >= io_off.
- */
- rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
- error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
- goto out;
- }
- rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
-
- if (off > zp->z_size) {
- /* past end of file */
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
-
- for (off = io_off; io_off < off + len; io_off += io_len) {
- if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
- pp = page_lookup(vp, io_off,
- (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
- } else {
- pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
- }
-
- if (pp != NULL && pvn_getdirty(pp, flags)) {
- int err;
-
- /*
- * Found a dirty page to push
- */
- err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
- if (err)
- error = err;
- } else {
- io_len = PAGESIZE;
- }
- }
-out:
- zfs_range_unlock(rl);
- if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* illumos */
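[Annotation] The entire illumos page-cache suite (zfs_null_putapage, zfs_putapage, zfs_putpage, zfs_getpage, zfs_map, zfs_addmap, zfs_delmap, zfs_space) is deleted rather than ported; FreeBSD reaches these paths through its own vnode pager and VM glue. For reference, the klustering math the deleted putpage relied on rounds a dirty page offset out to a whole file block. A runnable model of the P2 macros it used (valid for power-of-two sizes only):

	#include <stdint.h>
	#include <stdio.h>

	#define	P2ALIGN(x, a)	((x) & -(a))
	#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))

	int
	main(void)
	{
		uint64_t off = 150000, pagesz = 4096, blksz = 131072;
		uint64_t klen = P2ROUNDUP(blksz, pagesz);	/* whole block */
		uint64_t koff = P2ALIGN(off, klen);		/* aligned start */

		printf("kluster [%ju, %ju)\n",
		    (uintmax_t)koff, (uintmax_t)(koff + klen));
		return (0);
	}

Writing the full block at once avoids the read-modify-write that the comment in the deleted code warns about.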
/*ARGSUSED*/
void
@@ -4796,17 +4297,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
return;
}
- mutex_enter(&zp->z_lock);
if (zp->z_unlinked) {
/*
* Fast path to recycle a vnode of a removed file.
*/
- mutex_exit(&zp->z_lock);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
vrecycle(vp);
return;
}
- mutex_exit(&zp->z_lock);
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
@@ -4817,444 +4315,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
if (error) {
dmu_tx_abort(tx);
} else {
- mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
}
}
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
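[Annotation] zfs_inactive() keeps its two jobs — recycle unlinked vnodes fast, and lazily persist a dirty atime — but loses the z_lock bracketing for the reason noted earlier. A compact userland model of the flow (sa_update_atime() is a stand-in for the SA update):

	#include <stdbool.h>

	struct inode {
		long	atime;
		bool	atime_dirty;	/* set by reads, flushed lazily */
		bool	unlinked;
	};

	void vrecycle(struct inode *);		/* frees via the unlinked set */
	int sa_update_atime(struct inode *);	/* persists ip->atime */

	void
	inactive(struct inode *ip)
	{
		if (ip->unlinked) {
			vrecycle(ip);	/* fast path: no atime worth saving */
			return;
		}
		if (ip->atime_dirty && sa_update_atime(ip) == 0)
			ip->atime_dirty = false;
	}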
-#ifdef illumos
-/*
- * Bounds-check the seek operation.
- *
- * IN: vp - vnode seeking within
- * ooff - old file offset
- * noffp - pointer to new file offset
- * ct - caller context
- *
- * RETURN: 0 on success, EINVAL if new offset invalid.
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
- caller_context_t *ct)
-{
- if (vp->v_type == VDIR)
- return (0);
- return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
-
-/*
- * Pre-filter the generic locking function to trap attempts to place
- * a mandatory lock on a memory mapped file.
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
- flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * We are following the UFS semantics with respect to mapcnt
- * here: If we see that the file is mapped already, then we will
- * return an error, but we don't worry about races between this
- * function and zfs_map().
- */
- if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
- ZFS_EXIT(zfsvfs);
- return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
-}
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list. Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
- caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
- znode_t *zp = VTOZ(vp);
- page_t *pp, *cur_pp;
- objset_t *os = zp->z_zfsvfs->z_os;
- u_offset_t io_off, total;
- size_t io_len;
- int err;
-
- if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
- /*
- * We only have a single page, don't bother klustering
- */
- io_off = off;
- io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len,
- PG_EXCL | PG_WAIT, seg, addr);
- } else {
- /*
- * Try to find enough pages to fill the page list
- */
- pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, off, plsz, 0);
- }
- if (pp == NULL) {
- /*
- * The page already exists, nothing to do here.
- */
- *pl = NULL;
- return (0);
- }
-
- /*
- * Fill the pages in the kluster.
- */
- cur_pp = pp;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- caddr_t va;
-
- ASSERT3U(io_off, ==, cur_pp->p_offset);
- va = zfs_map_page(cur_pp, S_WRITE);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
- DMU_READ_PREFETCH);
- zfs_unmap_page(cur_pp, va);
- if (err) {
- /* On error, toss the entire kluster */
- pvn_read_done(pp, B_ERROR);
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
- cur_pp = cur_pp->p_next;
- }
-
- /*
- * Fill in the page list array from the kluster starting
- * from the desired offset `off'.
- * NOTE: the page list will always be null terminated.
- */
- pvn_plist_init(pp, pl, plsz, off, io_len, rw);
- ASSERT(pl == NULL || (*pl)->p_offset == off);
-
- return (0);
-}
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array. If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]). These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- * IN: vp - vnode of file to get data from.
- * off - position in file to get data from.
- * len - amount of data to retrieve.
- * plsz - length of provided page list.
- * seg - segment to obtain pages for.
- * addr - virtual address of fault.
- * rw - mode of created pages.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * OUT: protp - protection mode of created pages.
- * pl - list of pages created.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t **pl0 = pl;
- int err = 0;
-
- /* we do our own caching, faultahead is unnecessary */
- if (pl == NULL)
- return (0);
- else if (len > plsz)
- len = plsz;
- else
- len = P2ROUNDUP(len, PAGESIZE);
- ASSERT(plsz >= len);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (protp)
- *protp = PROT_ALL;
-
- /*
- * Loop through the requested range [off, off + len) looking
- * for pages. If we don't find a page, we will need to create
- * a new page and fill it with data from the file.
- */
- while (len > 0) {
- if (*pl = page_lookup(vp, off, SE_SHARED))
- *(pl+1) = NULL;
- else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
- goto out;
- while (*pl) {
- ASSERT3U((*pl)->p_offset, ==, off);
- off += PAGESIZE;
- addr += PAGESIZE;
- if (len > 0) {
- ASSERT3U(len, >=, PAGESIZE);
- len -= PAGESIZE;
- }
- ASSERT3U(plsz, >=, PAGESIZE);
- plsz -= PAGESIZE;
- pl++;
- }
- }
-
- /*
- * Fill out the page array with any pages already in the cache.
- */
- while (plsz > 0 &&
- (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
- off += PAGESIZE;
- plsz -= PAGESIZE;
- }
-out:
- if (err) {
- /*
- * Release any pages we have previously locked.
- */
- while (pl > pl0)
- page_unlock(*--pl);
- } else {
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- }
-
- *pl = NULL;
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Request a memory map for a section of a file. This code interacts
- * with common code and the VM system as follows:
- *
- * - common code calls mmap(), which ends up in smmap_common()
- * - this calls VOP_MAP(), which takes you into (say) zfs
- * - zfs_map() calls as_map(), passing segvn_create() as the callback
- * - segvn_create() creates the new segment and calls VOP_ADDMAP()
- * - zfs_addmap() updates z_mapcnt
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- segvn_crargs_t vn_a;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((prot & PROT_WRITE) && (zp->z_pflags &
- (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if ((prot & (PROT_READ | PROT_EXEC)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- if (vp->v_flag & VNOMAP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSYS));
- }
-
- if (off < 0 || len > MAXOFFSET_T - off) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENXIO));
- }
-
- if (vp->v_type != VREG) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENODEV));
- }
-
- /*
- * If file is locked, disallow mapping.
- */
- if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
-
- as_rangelock(as);
- error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
- if (error != 0) {
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vn_a.vp = vp;
- vn_a.offset = (u_offset_t)off;
- vn_a.type = flags & MAP_TYPE;
- vn_a.prot = prot;
- vn_a.maxprot = maxprot;
- vn_a.cred = cr;
- vn_a.amp = NULL;
- vn_a.flags = flags & ~MAP_TYPE;
- vn_a.szc = 0;
- vn_a.lgrp_mem_policy_flags = 0;
-
- error = as_map(as, *addrp, len, segvn_create, &vn_a);
-
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
- return (0);
-}
-
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
- atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
-
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
- return (0);
-}
-
-/*
- * Free or allocate space in a file. Currently, this function only
- * supports the `F_FREESP' command. However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- * IN: vp - vnode of file to free data in.
- * cmd - action to take (only F_FREESP supported).
- * bfp - section of file to free/alloc.
- * flag - current file open mode flags.
- * offset - current file offset.
- * cr - credentials of caller [UNUSED].
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
- offset_t offset, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t off, len;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (cmd != F_FREESP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
- * callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- if (error = convoff(vp, bfp, 0, offset)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (bfp->l_len < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- off = bfp->l_start;
- len = bfp->l_len; /* 0 means from off to end of file */
-
- error = zfs_freesp(zp, off, len, flag, TRUE);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* illumos */
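The F_FREESP convention documented above is easiest to see from the caller's side. A minimal, hypothetical sketch (field values invented; convoff() inside zfs_space() normalizes l_whence to an absolute offset):

	flock64_t bf;

	bf.l_whence = SEEK_SET;	/* l_start is an absolute file offset */
	bf.l_start = 65536;	/* first byte to free */
	bf.l_len = 0;		/* 0 => free from l_start to end of file */
	/* error = zfs_space(vp, F_FREESP, &bf, FWRITE, 0, cr, ct); */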
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
@@ -5331,7 +4400,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
{
znode_t *zp, *xzp;
zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
int error;
switch (cmd) {
@@ -5349,13 +4417,12 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
*valp = 0;
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
if (error == 0) {
- zfs_dirent_unlock(dl);
if (!zfs_dirempty(xzp))
*valp = 1;
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
} else if (error == ENOENT) {
/*
* If there aren't extended attributes, it's the
@@ -5448,339 +4515,6 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
return (error);
}
-#ifdef illumos
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10); /* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read. This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17); /* 128K */
-
-/*ARGSUSED*/
-static int
-zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int max_blksz = zfsvfs->z_max_blksz;
- uio_t *uio = &xuio->xu_uio;
- ssize_t size = uio->uio_resid;
- offset_t offset = uio->uio_loffset;
- int blksz;
- int fullblk, i;
- arc_buf_t *abuf;
- ssize_t maxsize;
- int preamble, postamble;
-
- if (xuio->xu_type != UIOTYPE_ZEROCOPY)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- switch (ioflag) {
- case UIO_WRITE:
- /*
- * Loan out an arc_buf for write if write size is bigger than
- * max_blksz, and the file's block size is also max_blksz.
- */
- blksz = max_blksz;
- if (size < blksz || zp->z_blksz != blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- /*
- * Caller requests buffers for write before knowing where the
- * write offset might be (e.g. NFS TCP write).
- */
- if (offset == -1) {
- preamble = 0;
- } else {
- preamble = P2PHASE(offset, blksz);
- if (preamble) {
- preamble = blksz - preamble;
- size -= preamble;
- }
- }
-
- postamble = P2PHASE(size, blksz);
- size -= postamble;
-
- fullblk = size / blksz;
- (void) dmu_xuio_init(xuio,
- (preamble != 0) + fullblk + (postamble != 0));
- DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
- int, postamble, int,
- (preamble != 0) + fullblk + (postamble != 0));
-
- /*
- * Have to fix iov base/len for partial buffers. They
- * currently represent full arc_buf's.
- */
- if (preamble) {
- /* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf,
- blksz - preamble, preamble);
- }
-
- for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, blksz);
- }
-
- if (postamble) {
- /* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, postamble);
- }
- break;
- case UIO_READ:
- /*
- * Loan out an arc_buf for read if the read size is larger than
- * the current file block size. Block alignment is not
- * considered. Partial arc_buf will be loaned out for read.
- */
- blksz = zp->z_blksz;
- if (blksz < zcr_blksz_min)
- blksz = zcr_blksz_min;
- if (blksz > zcr_blksz_max)
- blksz = zcr_blksz_max;
- /* avoid potential complexity of dealing with it */
- if (blksz > max_blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- maxsize = zp->z_size - uio->uio_loffset;
- if (size > maxsize)
- size = maxsize;
-
- if (size < blksz || vn_has_cached_data(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- break;
- default:
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- uio->uio_extflg = UIO_XUIO;
- XUIO_XUZC_RW(xuio) = ioflag;
- ZFS_EXIT(zfsvfs);
- return (0);
-}
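The preamble/full-block/postamble split computed in zfs_reqzcbuf() above is clearer with concrete numbers. A standalone sketch of the same arithmetic, assuming P2PHASE() is the usual illumos macro ((x) & ((align) - 1)):

#include <stdio.h>
#include <stdint.h>

#define P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */

int main(void)
{
	uint64_t blksz = 131072;	/* hypothetical 128K block size */
	uint64_t offset = 98304;	/* write starts 96K into a block */
	uint64_t size = 524288;		/* 512K request */
	uint64_t preamble, postamble;

	preamble = P2PHASE(offset, blksz);
	if (preamble != 0) {
		preamble = blksz - preamble;	/* bytes up to next boundary */
		size -= preamble;
	}
	postamble = P2PHASE(size, blksz);
	size -= postamble;

	/* prints: preamble 32768, full blocks 3, postamble 98304 */
	printf("preamble %ju, full blocks %ju, postamble %ju\n",
	    (uintmax_t)preamble, (uintmax_t)(size / blksz),
	    (uintmax_t)postamble);
	return (0);
}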
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
-{
- int i;
- arc_buf_t *abuf;
- int ioflag = XUIO_XUZC_RW(xuio);
-
- ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
- i = dmu_xuio_cnt(xuio);
- while (i-- > 0) {
- abuf = dmu_xuio_arcbuf(xuio, i);
- /*
- * if abuf == NULL, it must be a write buffer
- * that has been returned in zfs_write().
- */
- if (abuf)
- dmu_return_arcbuf(abuf);
- ASSERT(abuf || ioflag == UIO_WRITE);
- }
-
- dmu_xuio_fini(xuio);
- return (0);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-
-static int
-zfs_inval()
-{
- return (SET_ERROR(EINVAL));
-}
-
-static int
-zfs_isdir()
-{
- return (SET_ERROR(EISDIR));
-}
-/*
- * Directory vnode operations template
- */
-vnodeops_t *zfs_dvnodeops;
-const fs_operation_def_t zfs_dvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .error = zfs_isdir },
- VOPNAME_WRITE, { .error = zfs_isdir },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Regular file vnode operations template
- */
-vnodeops_t *zfs_fvnodeops;
-const fs_operation_def_t zfs_fvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .vop_read = zfs_read },
- VOPNAME_WRITE, { .vop_write = zfs_write },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
- VOPNAME_SPACE, { .vop_space = zfs_space },
- VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
- VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
- VOPNAME_MAP, { .vop_map = zfs_map },
- VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
- VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
- VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
- NULL, NULL
-};
-
-/*
- * Symbolic link vnode operations template
- */
-vnodeops_t *zfs_symvnodeops;
-const fs_operation_def_t zfs_symvnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * special share hidden files vnode operations template
- */
-vnodeops_t *zfs_sharevnodeops;
-const fs_operation_def_t zfs_sharevnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Extended attribute directory vnode operations template
- *
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- * VOP_MKDIR()
- * VOP_SYMLINK()
- *
- * Note that there are other restrictions embedded in:
- * zfs_create() - restrict type to VREG
- * zfs_link() - no links into/out of attribute space
- * zfs_rename() - no moves into/out of attribute space
- */
-vnodeops_t *zfs_xdvnodeops;
-const fs_operation_def_t zfs_xdvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .error = zfs_inval },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .error = zfs_inval },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Error vnode operations template
- */
-vnodeops_t *zfs_evnodeops;
-const fs_operation_def_t zfs_evnodeops_template[] = {
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- NULL, NULL
-};
-#endif /* illumos */
-
static int
ioflags(int ioflags)
{
@@ -5789,7 +4523,7 @@ ioflags(int ioflags)
if (ioflags & IO_APPEND)
flags |= FAPPEND;
if (ioflags & IO_NDELAY)
- flags |= FNONBLOCK;
+ flags |= FNONBLOCK;
if (ioflags & IO_SYNC)
flags |= (FSYNC | FDSYNC | FRSYNC);
@@ -6257,6 +4991,23 @@ zfs_freebsd_lookup(ap)
}
static int
+zfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap));
+}
+
+static int
zfs_freebsd_create(ap)
struct vop_create_args /* {
struct vnode *a_dvp;
@@ -6265,6 +5016,7 @@ zfs_freebsd_create(ap)
struct vattr *a_vap;
} */ *ap;
{
+ zfsvfs_t *zfsvfs;
struct componentname *cnp = ap->a_cnp;
vattr_t *vap = ap->a_vap;
int error, mode;
@@ -6273,13 +5025,13 @@ zfs_freebsd_create(ap)
vattr_init_mask(vap);
mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
-#ifdef FREEBSD_NAMECACHE
- if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
-#endif
return (error);
}
@@ -6294,8 +5046,8 @@ zfs_freebsd_remove(ap)
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
- return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred, NULL, 0));
+ return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
}
static int
@@ -6314,7 +5066,7 @@ zfs_freebsd_mkdir(ap)
vattr_init_mask(vap);
return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred, NULL, 0, NULL));
+ ap->a_cnp->cn_cred));
}
static int
@@ -6329,7 +5081,7 @@ zfs_freebsd_rmdir(ap)
ASSERT(cnp->cn_flags & SAVENAME);
- return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+ return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
static int
@@ -6563,23 +5315,14 @@ zfs_freebsd_rename(ap)
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
- /*
- * Check for cross-device rename.
- */
- if ((fdvp->v_mount != tdvp->v_mount) ||
- (tvp && (fdvp->v_mount != tvp->v_mount)))
- error = EXDEV;
- else
- error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
- ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
- if (tdvp == tvp)
- VN_RELE(tdvp);
- else
- VN_URELE(tdvp);
- if (tvp)
- VN_URELE(tvp);
- VN_RELE(fdvp);
- VN_RELE(fvp);
+ error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
return (error);
}
@@ -7250,6 +5993,39 @@ zfs_vptocnp(struct vop_vptocnp_args *ap)
return (error);
}
+#ifdef DIAGNOSTIC
+static int
+zfs_lock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp;
+ vnode_t *vp;
+ int flags;
+ int err;
+
+ vp = ap->a_vp;
+ flags = ap->a_flags;
+ if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 &&
+ (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
+ zfsvfs = zp->z_zfsvfs;
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+ }
+ err = vop_stdlock(ap);
+ if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 &&
+ (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
+ zfsvfs = zp->z_zfsvfs;
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+ }
+ return (err);
+}
+#endif
+
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
@@ -7259,12 +6035,8 @@ struct vop_vector zfs_vnodeops = {
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_access = zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
- .vop_lookup = vfs_cache_lookup,
+ .vop_lookup = zfs_cache_lookup,
.vop_cachedlookup = zfs_freebsd_lookup,
-#else
- .vop_lookup = zfs_freebsd_lookup,
-#endif
.vop_getattr = zfs_freebsd_getattr,
.vop_setattr = zfs_freebsd_setattr,
.vop_create = zfs_freebsd_create,
@@ -7296,6 +6068,9 @@ struct vop_vector zfs_vnodeops = {
.vop_getpages = zfs_freebsd_getpages,
.vop_putpages = zfs_freebsd_putpages,
.vop_vptocnp = zfs_vptocnp,
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#endif
};
struct vop_vector zfs_fifoops = {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index 3853838..c947e54 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -124,16 +124,12 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
list_link_init(&zp->z_link_node);
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
zp->z_vnode = NULL;
zp->z_moved = 0;
@@ -150,14 +146,10 @@ zfs_znode_cache_destructor(void *buf, void *arg)
ASSERT(ZTOV(zp) == NULL);
vn_free(ZTOV(zp));
ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -559,8 +551,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
- mutex_enter(&zp->z_lock);
-
ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
if (sa_hdl == NULL) {
@@ -580,7 +570,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
ZTOV(zp)->v_flag |= VROOT;
- mutex_exit(&zp->z_lock);
vn_exists(ZTOV(zp));
}
@@ -637,7 +626,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_vnode = vp;
vp->v_data = zp;
- ASSERT(zp->z_dirlocks == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
zp->z_moved = 0;
@@ -739,7 +727,14 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
/*
* Acquire vnode lock before making it available to the world.
*/
+#ifdef DIAGNOSTIC
+ vop_lock1_t *orig_lock = vp->v_op->vop_lock1;
+ vp->v_op->vop_lock1 = vop_stdlock;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vp->v_op->vop_lock1 = orig_lock;
+#else
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#endif
VN_LOCK_AREC(vp);
if (vp->v_type != VFIFO)
VN_LOCK_ASHARE(vp);
@@ -1161,54 +1156,55 @@ again:
if (hdl != NULL) {
zp = sa_get_userdata(hdl);
-
/*
* Since "SA" does immediate eviction we
* should never find a sa handle that doesn't
* know about the znode.
*/
-
ASSERT3P(zp, !=, NULL);
-
- mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- err = SET_ERROR(ENOENT);
- } else {
- vp = ZTOV(zp);
- *zpp = zp;
- err = 0;
- }
+ *zpp = zp;
+ vp = ZTOV(zp);
/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
- if (err == 0)
- VN_HOLD(vp);
+ VN_HOLD(vp);
- mutex_exit(&zp->z_lock);
sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- if (err == 0) {
- locked = VOP_ISLOCKED(vp);
- VI_LOCK(vp);
- if ((vp->v_iflag & VI_DOOMED) != 0 &&
- locked != LK_EXCLUSIVE) {
- /*
- * The vnode is doomed and this thread doesn't
- * hold the exclusive lock on it, so the vnode
- * must be being reclaimed by another thread.
- * Otherwise the doomed vnode is being reclaimed
- * by this thread and zfs_zget is called from
- * ZIL internals.
- */
- VI_UNLOCK(vp);
- VN_RELE(vp);
- goto again;
- }
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0 &&
+ locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread peforms
+		 * But that is only possible if the other thread performs
+ * should not happen if the vnode is dead or the thread
+ * should also have a refrence to the vnode and thus
+		 * should also have a reference to the vnode and thus
+ */
+ VN_RELE(vp);
+ goto again;
}
+ VI_UNLOCK(vp);
getnewvnode_drop_reserve();
- return (err);
+ return (0);
}
/*
@@ -1391,20 +1387,16 @@ zfs_zinactive(znode_t *zp)
*/
ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
- mutex_enter(&zp->z_lock);
-
/*
* If this was the last reference to a file with no links,
* remove the file from the file system.
*/
if (zp->z_unlinked) {
- mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_rmnode(zp);
return;
}
- mutex_exit(&zp->z_lock);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_znode_free(zp);
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index e6b1e90..0da005a 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -290,7 +290,10 @@ dev/lindev/full.c optional lindev
dev/lindev/lindev.c optional lindev
dev/nfe/if_nfe.c optional nfe pci
dev/ntb/if_ntb/if_ntb.c optional if_ntb
-dev/ntb/ntb_hw/ntb_hw.c optional if_ntb ntb_hw
+dev/ntb/ntb_transport.c optional if_ntb
+dev/ntb/ntb.c optional if_ntb | ntb_hw
+dev/ntb/ntb_if.m optional if_ntb | ntb_hw
+dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw
dev/nvd/nvd.c optional nvd nvme
dev/nve/if_nve.c optional nve pci
dev/nvme/nvme.c optional nvme
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index fe93e68..e46fe53 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -286,7 +286,10 @@ dev/mse/mse.c optional mse
dev/mse/mse_isa.c optional mse isa
dev/nfe/if_nfe.c optional nfe pci
dev/ntb/if_ntb/if_ntb.c optional if_ntb
-dev/ntb/ntb_hw/ntb_hw.c optional if_ntb | ntb_hw
+dev/ntb/ntb_transport.c optional if_ntb
+dev/ntb/ntb.c optional if_ntb | ntb_hw
+dev/ntb/ntb_if.m optional if_ntb | ntb_hw
+dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw
dev/nvd/nvd.c optional nvd nvme
dev/nve/if_nve.c optional nve pci
dev/nvme/nvme.c optional nvme
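With the driver split into a core, a hardware back-end, and a transport layer, kernel configs name the pieces separately. A hypothetical config fragment (per the rules above, the shared ntb.c core and ntb_if.m glue are pulled in by either option):

device	ntb_hw		# NTB hardware driver
device	if_ntb		# simulated Ethernet over NTB (brings in ntb_transport)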
diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c
index 9db1c44..1295de7 100644
--- a/sys/dev/ahci/ahci.c
+++ b/sys/dev/ahci/ahci.c
@@ -373,7 +373,8 @@ ahci_setup_interrupt(device_t dev)
else if (ctlr->numirqs == 1 || i >= ctlr->channels ||
(ctlr->ccc && i == ctlr->cccv))
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ALL;
- else if (i == ctlr->numirqs - 1)
+ else if (ctlr->channels > ctlr->numirqs &&
+ i == ctlr->numirqs - 1)
ctlr->irqs[i].mode = AHCI_IRQ_MODE_AFTER;
else
ctlr->irqs[i].mode = AHCI_IRQ_MODE_ONE;
@@ -422,6 +423,7 @@ ahci_intr(void *data)
} else { /* AHCI_IRQ_MODE_AFTER */
unit = irq->r_irq_rid - 1;
is = ATA_INL(ctlr->r_mem, AHCI_IS);
+ is &= (0xffffffff << unit);
}
/* CCC interrupt is edge triggered. */
if (ctlr->ccc)
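The new mask in the AHCI_IRQ_MODE_AFTER path is worth a worked example. A standalone sketch with hypothetical values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical controller: 4 MSI vectors, 8 channels; the last
	 * vector (unit == numirqs - 1 == 3) serves channel 3 and above. */
	uint32_t is = 0x000000ffu;	/* all 8 ports report status */
	int unit = 3;

	is &= (0xffffffffu << unit);	/* drop ports owned by other vectors */
	printf("0x%08x\n", is);		/* prints 0x000000f8 (ports 3-7) */
	return (0);
}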
diff --git a/sys/dev/ahci/ahci_pci.c b/sys/dev/ahci/ahci_pci.c
index 22f28e2..bb14ed6 100644
--- a/sys/dev/ahci/ahci_pci.c
+++ b/sys/dev/ahci/ahci_pci.c
@@ -187,7 +187,7 @@ static const struct {
{0xa10f8086, 0x00, "Intel Sunrise Point (RAID)", 0},
{0x23238086, 0x00, "Intel DH89xxCC", 0},
{0x2360197b, 0x00, "JMicron JMB360", 0},
- {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE},
+ {0x2361197b, 0x00, "JMicron JMB361", AHCI_Q_NOFORCE | AHCI_Q_1CH},
{0x2362197b, 0x00, "JMicron JMB362", 0},
{0x2363197b, 0x00, "JMicron JMB363", AHCI_Q_NOFORCE},
{0x2365197b, 0x00, "JMicron JMB365", AHCI_Q_NOFORCE},
diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c
index 28379cc..52e2609 100644
--- a/sys/dev/e1000/e1000_api.c
+++ b/sys/dev/e1000/e1000_api.c
@@ -304,6 +304,10 @@ s32 e1000_set_mac_type(struct e1000_hw *hw)
case E1000_DEV_ID_PCH_SPT_I219_LM2:
case E1000_DEV_ID_PCH_SPT_I219_V2:
case E1000_DEV_ID_PCH_LBG_I219_LM3:
+ case E1000_DEV_ID_PCH_SPT_I219_LM4:
+ case E1000_DEV_ID_PCH_SPT_I219_V4:
+ case E1000_DEV_ID_PCH_SPT_I219_LM5:
+ case E1000_DEV_ID_PCH_SPT_I219_V5:
mac->type = e1000_pch_spt;
break;
case E1000_DEV_ID_82575EB_COPPER:
diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h
index 1792e14..e1464a7 100644
--- a/sys/dev/e1000/e1000_hw.h
+++ b/sys/dev/e1000/e1000_hw.h
@@ -142,6 +142,10 @@ struct e1000_hw;
#define E1000_DEV_ID_PCH_SPT_I219_LM2 0x15B7 /* Sunrise Point-H PCH */
#define E1000_DEV_ID_PCH_SPT_I219_V2 0x15B8 /* Sunrise Point-H PCH */
#define E1000_DEV_ID_PCH_LBG_I219_LM3 0x15B9 /* LEWISBURG PCH */
+#define E1000_DEV_ID_PCH_SPT_I219_LM4 0x15D7
+#define E1000_DEV_ID_PCH_SPT_I219_V4 0x15D8
+#define E1000_DEV_ID_PCH_SPT_I219_LM5 0x15E3
+#define E1000_DEV_ID_PCH_SPT_I219_V5 0x15D6
#define E1000_DEV_ID_82576 0x10C9
#define E1000_DEV_ID_82576_FIBER 0x10E6
#define E1000_DEV_ID_82576_SERDES 0x10E7
@@ -957,9 +961,13 @@ struct e1000_dev_spec_ich8lan {
E1000_MUTEX nvm_mutex;
E1000_MUTEX swflag_mutex;
bool nvm_k1_enabled;
+ bool disable_k1_off;
bool eee_disable;
u16 eee_lp_ability;
enum e1000_ulp_state ulp_state;
+ bool ulp_capability_disabled;
+ bool during_suspend_flow;
+ bool during_dpg_exit;
};
struct e1000_dev_spec_82575 {
diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c
index 9b9a090..4c93662 100644
--- a/sys/dev/e1000/e1000_ich8lan.c
+++ b/sys/dev/e1000/e1000_ich8lan.c
@@ -288,7 +288,7 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw)
mac_reg &= ~E1000_CTRL_LANPHYPC_VALUE;
E1000_WRITE_REG(hw, E1000_CTRL, mac_reg);
E1000_WRITE_FLUSH(hw);
- usec_delay(10);
+ msec_delay(1);
mac_reg &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
E1000_WRITE_REG(hw, E1000_CTRL, mac_reg);
E1000_WRITE_FLUSH(hw);
@@ -1625,7 +1625,17 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
hw->phy.ops.write_reg_locked(hw,
I217_PLL_CLOCK_GATE_REG,
phy_reg);
- }
+
+ if (speed == SPEED_1000) {
+ hw->phy.ops.read_reg_locked(hw, HV_PM_CTRL,
+ &phy_reg);
+
+ phy_reg |= HV_PM_CTRL_K1_CLK_REQ;
+
+ hw->phy.ops.write_reg_locked(hw, HV_PM_CTRL,
+ phy_reg);
+ }
+ }
hw->phy.ops.release(hw);
if (ret_val)
@@ -1718,7 +1728,8 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
u32 pcieanacfg = E1000_READ_REG(hw, E1000_PCIEANACFG);
u32 fextnvm6 = E1000_READ_REG(hw, E1000_FEXTNVM6);
- if (pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE)
+ if ((pcieanacfg & E1000_FEXTNVM6_K1_OFF_ENABLE) &&
+ (hw->dev_spec.ich8lan.disable_k1_off == FALSE))
fextnvm6 |= E1000_FEXTNVM6_K1_OFF_ENABLE;
else
fextnvm6 &= ~E1000_FEXTNVM6_K1_OFF_ENABLE;
diff --git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h
index edc1dd1..6d81291 100644
--- a/sys/dev/e1000/e1000_ich8lan.h
+++ b/sys/dev/e1000/e1000_ich8lan.h
@@ -239,7 +239,7 @@
/* PHY Power Management Control */
#define HV_PM_CTRL PHY_REG(770, 17)
-#define HV_PM_CTRL_PLL_STOP_IN_K1_GIGA 0x100
+#define HV_PM_CTRL_K1_CLK_REQ 0x200
#define HV_PM_CTRL_K1_ENABLE 0x4000
#define I217_PLL_CLOCK_GATE_REG PHY_REG(772, 28)
diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c
index b2bec3e..9684b43 100644
--- a/sys/dev/e1000/e1000_phy.c
+++ b/sys/dev/e1000/e1000_phy.c
@@ -4148,10 +4148,10 @@ s32 e1000_read_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 *data)
/* Disable access to mPHY if it was originally disabled */
if (locked)
ready = e1000_is_mphy_ready(hw);
- if (!ready)
- return -E1000_ERR_PHY;
- E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
- E1000_MPHY_DIS_ACCESS);
+ if (!ready)
+ return -E1000_ERR_PHY;
+ E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
+ E1000_MPHY_DIS_ACCESS);
return E1000_SUCCESS;
}
@@ -4213,10 +4213,10 @@ s32 e1000_write_phy_reg_mphy(struct e1000_hw *hw, u32 address, u32 data,
/* Disable access to mPHY if it was originally disabled */
if (locked)
ready = e1000_is_mphy_ready(hw);
- if (!ready)
- return -E1000_ERR_PHY;
- E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
- E1000_MPHY_DIS_ACCESS);
+ if (!ready)
+ return -E1000_ERR_PHY;
+ E1000_WRITE_REG(hw, E1000_MPHY_ADDR_CTRL,
+ E1000_MPHY_DIS_ACCESS);
return E1000_SUCCESS;
}
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 46f3e48..6b6b791 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -192,6 +192,12 @@ static em_vendor_info_t em_vendor_info_array[] =
{ 0x8086, E1000_DEV_ID_PCH_SPT_I219_V2, PCI_ANY_ID, PCI_ANY_ID, 0},
{ 0x8086, E1000_DEV_ID_PCH_LBG_I219_LM3,
PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM4,
+ PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V4, PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_LM5,
+ PCI_ANY_ID, PCI_ANY_ID, 0},
+ { 0x8086, E1000_DEV_ID_PCH_SPT_I219_V5, PCI_ANY_ID, PCI_ANY_ID, 0},
/* required last entry */
{ 0, 0, 0, 0, 0}
};
diff --git a/sys/dev/filemon/filemon.c b/sys/dev/filemon/filemon.c
index 919af9d..26e1bc3 100644
--- a/sys/dev/filemon/filemon.c
+++ b/sys/dev/filemon/filemon.c
@@ -137,6 +137,8 @@ filemon_proc_get(struct proc *p)
{
struct filemon *filemon;
+ if (p->p_filemon == NULL)
+ return (NULL);
PROC_LOCK(p);
filemon = filemon_acquire(p->p_filemon);
PROC_UNLOCK(p);
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
index 936e4e1..18626cb 100644
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -810,6 +810,7 @@ hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
* because the fields will be used later in storvsc_io_done().
*/
request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status;
+ request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status;
request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len;
if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) &&
@@ -1945,28 +1946,6 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
return(0);
}
-/*
- * SCSI Inquiry checks qualifier and type.
- * If qualifier is 011b, means the device server is not capable
- * of supporting a peripheral device on this logical unit, and
- * the type should be set to 1Fh.
- *
- * Return 1 if it is valid, 0 otherwise.
- */
-static inline int
-is_inquiry_valid(const struct scsi_inquiry_data *inq_data)
-{
- uint8_t type;
- if (SID_QUAL(inq_data) != SID_QUAL_LU_CONNECTED) {
- return (0);
- }
- type = SID_TYPE(inq_data);
- if (type == T_NODEVICE) {
- return (0);
- }
- return (1);
-}
-
/**
* @brief completion function before returning to CAM
*
@@ -1985,7 +1964,6 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
bus_dma_segment_t *ori_sglist = NULL;
int ori_sg_count = 0;
-
/* destroy bounce buffer if it is used */
if (reqp->bounce_sgl_count) {
ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
@@ -2040,88 +2018,71 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
ccb->ccb_h.status &= ~CAM_STATUS_MASK;
if (vm_srb->scsi_status == SCSI_STATUS_OK) {
const struct scsi_generic *cmd;
- /*
- * Check whether the data for INQUIRY cmd is valid or
- * not. Windows 10 and Windows 2016 send all zero
- * inquiry data to VM even for unpopulated slots.
- */
+
+ if (vm_srb->srb_status != SRB_STATUS_SUCCESS) {
+ if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) {
+ xpt_print(ccb->ccb_h.path, "invalid LUN %d\n",
+ vm_srb->lun);
+ } else {
+ xpt_print(ccb->ccb_h.path, "Unknown SRB flag: %d\n",
+ vm_srb->srb_status);
+ }
+ /*
+ * If there are errors, for example, invalid LUN,
+ * host will inform VM through SRB status.
+ */
+ ccb->ccb_h.status |= CAM_SEL_TIMEOUT;
+ } else {
+ ccb->ccb_h.status |= CAM_REQ_CMP;
+ }
+
cmd = (const struct scsi_generic *)
((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes);
if (cmd->opcode == INQUIRY) {
- /*
- * The host of Windows 10 or 2016 server will response
- * the inquiry request with invalid data for unexisted device:
- [0x7f 0x0 0x5 0x2 0x1f ... ]
- * But on windows 2012 R2, the response is:
- [0x7f 0x0 0x0 0x0 0x0 ]
- * That is why here wants to validate the inquiry response.
- * The validation will skip the INQUIRY whose response is short,
- * which is less than SHORT_INQUIRY_LENGTH (36).
- *
- * For more information about INQUIRY, please refer to:
- * ftp://ftp.avc-pioneer.com/Mtfuji_7/Proposal/Jun09/INQUIRY.pdf
- */
- struct scsi_inquiry_data *inq_data =
- (struct scsi_inquiry_data *)csio->data_ptr;
- uint8_t* resp_buf = (uint8_t*)csio->data_ptr;
- /* Get the buffer length reported by host */
- int resp_xfer_len = vm_srb->transfer_len;
- /* Get the available buffer length */
- int resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
- int data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len;
- if (data_len < SHORT_INQUIRY_LENGTH) {
- ccb->ccb_h.status |= CAM_REQ_CMP;
- if (bootverbose && data_len >= 5) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc skips the validation for short inquiry (%d)"
- " [%x %x %x %x %x]\n",
- data_len,resp_buf[0],resp_buf[1],resp_buf[2],
- resp_buf[3],resp_buf[4]);
- mtx_unlock(&sc->hs_lock);
- }
- } else if (is_inquiry_valid(inq_data) == 0) {
- ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
+ struct scsi_inquiry_data *inq_data =
+ (struct scsi_inquiry_data *)csio->data_ptr;
+ uint8_t *resp_buf = (uint8_t *)csio->data_ptr;
+ int resp_xfer_len, resp_buf_len, data_len;
+
+ /* Get the buffer length reported by host */
+ resp_xfer_len = vm_srb->transfer_len;
+ /* Get the available buffer length */
+ resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
+ data_len = (resp_buf_len < resp_xfer_len) ?
+ resp_buf_len : resp_xfer_len;
+
if (bootverbose && data_len >= 5) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc uninstalled invalid device"
- " [%x %x %x %x %x]\n",
- resp_buf[0],resp_buf[1],resp_buf[2],resp_buf[3],resp_buf[4]);
- mtx_unlock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path, "storvsc inquiry "
+ "(%d) [%x %x %x %x %x ... ]\n", data_len,
+ resp_buf[0], resp_buf[1], resp_buf[2],
+ resp_buf[3], resp_buf[4]);
}
- } else {
- char vendor[16];
- cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor),
- sizeof(vendor));
- /**
- * XXX: upgrade SPC2 to SPC3 if host is WIN8 or WIN2012 R2
- * in order to support UNMAP feature
- */
- if (!strncmp(vendor,"Msft",4) &&
- SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
- (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
- vmstor_proto_version== VMSTOR_PROTOCOL_VERSION_WIN8)) {
- inq_data->version = SCSI_REV_SPC3;
- if (bootverbose) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc upgrades SPC2 to SPC3\n");
- mtx_unlock(&sc->hs_lock);
+ if (vm_srb->srb_status == SRB_STATUS_SUCCESS &&
+ data_len > SHORT_INQUIRY_LENGTH) {
+ char vendor[16];
+
+ cam_strvis(vendor, inq_data->vendor,
+ sizeof(inq_data->vendor), sizeof(vendor));
+
+ /*
+ * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or
+ * WIN2012 R2 in order to support UNMAP feature.
+ */
+ if (!strncmp(vendor, "Msft", 4) &&
+ SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
+ (vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+ vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8)) {
+ inq_data->version = SCSI_REV_SPC3;
+ if (bootverbose) {
+ xpt_print(ccb->ccb_h.path,
+ "storvsc upgrades "
+ "SPC2 to SPC3\n");
+ }
}
}
- ccb->ccb_h.status |= CAM_REQ_CMP;
- if (bootverbose) {
- mtx_lock(&sc->hs_lock);
- xpt_print(ccb->ccb_h.path,
- "storvsc has passed inquiry response (%d) validation\n",
- data_len);
- mtx_unlock(&sc->hs_lock);
- }
- }
- } else {
- ccb->ccb_h.status |= CAM_REQ_CMP;
}
} else {
mtx_lock(&sc->hs_lock);
diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h
index f2b9480..9205e35 100644
--- a/sys/dev/hyperv/storvsc/hv_vstorage.h
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@@ -249,9 +249,9 @@ struct vstor_packet {
/**
* SRB Status Masks (can be combined with above status codes)
*/
-#define SRB_STATUS_QUEUE_FROZEN 0x40
-#define SRB_STATUS_AUTOSENSE_VALID 0x80
-
+#define SRB_STATUS_QUEUE_FROZEN 0x40
+#define SRB_STATUS_AUTOSENSE_VALID 0x80
+#define SRB_STATUS_INVALID_LUN	0x20
/**
* Packet flags
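Because srb_status mixes base codes with combinable mask bits, decomposing a value takes a mask step. A standalone sketch (the 0x01 success code is assumed from the Windows SRB definitions, not taken from this header):

#include <stdio.h>
#include <stdint.h>

#define SRB_STATUS_SUCCESS		0x01	/* assumed base code */
#define SRB_STATUS_INVALID_LUN		0x20
#define SRB_STATUS_QUEUE_FROZEN		0x40
#define SRB_STATUS_AUTOSENSE_VALID	0x80

int main(void)
{
	uint8_t srb_status = SRB_STATUS_INVALID_LUN | SRB_STATUS_QUEUE_FROZEN;
	uint8_t base;

	/* Strip the combinable mask bits to recover the base code. */
	base = srb_status & ~(SRB_STATUS_QUEUE_FROZEN |
	    SRB_STATUS_AUTOSENSE_VALID);
	printf("base status 0x%02x\n", base);	/* prints 0x20 */
	return (0);
}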
diff --git a/sys/dev/isp/isp.c b/sys/dev/isp/isp.c
index aa36453..9d38f60 100644
--- a/sys/dev/isp/isp.c
+++ b/sys/dev/isp/isp.c
@@ -2431,6 +2431,7 @@ isp_fc_enable_vp(ispsoftc_t *isp, int chan)
__func__, chan, vp.vp_mod_hdr.rqs_flags, vp.vp_mod_status);
return (EIO);
}
+ GET_NANOTIME(&isp->isp_init_time);
return (0);
}
@@ -5865,6 +5866,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* These are broadcast events that have to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
int topo = fcp->isp_topo;
@@ -5921,6 +5923,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* This is a broadcast event that has to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
@@ -5964,6 +5967,7 @@ isp_parse_async_fc(ispsoftc_t *isp, uint16_t mbox)
* This is a broadcast event that has to be sent across
* all active channels.
*/
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
@@ -6162,6 +6166,7 @@ isp_handle_other_response(ispsoftc_t *isp, int type, isphdr_t *hp, uint32_t *opt
portid = (uint32_t)rid.ridacq_vp_port_hi << 16 |
rid.ridacq_vp_port_lo;
if (rid.ridacq_format == 0) {
+ GET_NANOTIME(&isp->isp_init_time);
for (chan = 0; chan < isp->isp_nchan; chan++) {
fcparam *fcp = FCPARAM(isp, chan);
if (fcp->role == ISP_ROLE_NONE)
diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c
index c6b8dc4..cfaccea 100644
--- a/sys/dev/isp/isp_freebsd.c
+++ b/sys/dev/isp/isp_freebsd.c
@@ -856,7 +856,7 @@ static void isp_handle_platform_atio7(ispsoftc_t *, at7_entry_t *);
static void isp_handle_platform_ctio(ispsoftc_t *, void *);
static void isp_handle_platform_notify_fc(ispsoftc_t *, in_fcentry_t *);
static void isp_handle_platform_notify_24xx(ispsoftc_t *, in_fcentry_24xx_t *);
-static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *);
+static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *, uint32_t rsp);
static void isp_handle_platform_target_tmf(ispsoftc_t *, isp_notify_t *);
static void isp_target_mark_aborted(ispsoftc_t *, union ccb *);
static void isp_target_mark_aborted_early(ispsoftc_t *, tstate_t *, uint32_t);
@@ -2003,7 +2003,7 @@ noresrc:
ntp = isp_get_ntpd(isp, tptr);
if (ntp == NULL) {
rls_lun_statep(isp, tptr);
- isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0);
+ isp_endcmd(isp, aep, SCSI_STATUS_BUSY, 0);
return;
}
memcpy(ntp->rd.data, aep, QENTRY_LEN);
@@ -2055,7 +2055,7 @@ isp_handle_platform_atio7(ispsoftc_t *isp, at7_entry_t *aep)
* It's a bit tricky here as we need to stash this command *somewhere*.
*/
GET_NANOTIME(&now);
- if (NANOTIME_SUB(&isp->isp_init_time, &now) > 2000000000ULL) {
+ if (NANOTIME_SUB(&now, &isp->isp_init_time) > 2000000000ULL) {
isp_prt(isp, ISP_LOGWARN, "%s: [RX_ID 0x%x] D_ID %x not found on any channel- dropping", __func__, aep->at_rxid, did);
isp_endcmd(isp, aep, NIL_HANDLE, ISP_NOCHAN, ECMD_TERMINATE, 0);
return;
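The swapped NANOTIME_SUB() operands above turn an inverted subtraction into a real elapsed-time test. A standalone sketch with plain integers (the driver passes timespec pointers; the macro is assumed to compute a - b in nanoseconds using unsigned arithmetic):

#include <stdio.h>
#include <stdint.h>

#define NANOTIME_SUB(a, b)	((uint64_t)((a) - (b)))	/* assumed semantics */

int main(void)
{
	uint64_t init_time = 1000000000ULL;	/* init at t = 1s */
	uint64_t now = 2500000000ULL;		/* command arrives at t = 2.5s */

	/* The old operand order underflows to a huge value, so the
	 * "> 2000000000ULL" guard fired even right after init... */
	printf("%llu\n", (unsigned long long)NANOTIME_SUB(init_time, now));
	/* ...while the corrected order yields 1.5s of elapsed time. */
	printf("%llu\n", (unsigned long long)NANOTIME_SUB(now, init_time));
	return (0);
}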
@@ -2761,7 +2761,7 @@ isp_handle_platform_notify_24xx(ispsoftc_t *isp, in_fcentry_24xx_t *inot)
}
static int
-isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp)
+isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp, uint32_t rsp)
{
if (isp->isp_state != ISP_RUNSTATE) {
@@ -2796,6 +2796,15 @@ isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp)
cto->ct_oxid = aep->at_hdr.ox_id;
cto->ct_flags = CT7_SENDSTATUS|CT7_NOACK|CT7_NO_DATA|CT7_FLAG_MODE1;
cto->ct_flags |= (aep->at_ta_len >> 12) << CT7_TASK_ATTR_SHIFT;
+ if (rsp != 0) {
+ cto->ct_scsi_status |= (FCP_RSPLEN_VALID << 8);
+ cto->rsp.m1.ct_resplen = 4;
+ ISP_MEMZERO(cto->rsp.m1.ct_resp, sizeof (cto->rsp.m1.ct_resp));
+ cto->rsp.m1.ct_resp[0] = rsp & 0xff;
+ cto->rsp.m1.ct_resp[1] = (rsp >> 8) & 0xff;
+ cto->rsp.m1.ct_resp[2] = (rsp >> 16) & 0xff;
+ cto->rsp.m1.ct_resp[3] = (rsp >> 24) & 0xff;
+ }
return (isp_target_put_entry(isp, &local));
}
@@ -3642,7 +3651,8 @@ isp_action(struct cam_sim *sim, union ccb *ccb)
xpt_done(ccb);
break;
}
- if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt)) {
+ if (isp_handle_platform_target_notify_ack(isp, &ntp->rd.nt,
+ (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0)) {
rls_lun_statep(isp, tptr);
cam_freeze_devq(ccb->ccb_h.path);
cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0);
@@ -4407,11 +4417,11 @@ changed:
/*
* This is device arrival/departure notification
*/
- isp_handle_platform_target_notify_ack(isp, notify);
+ isp_handle_platform_target_notify_ack(isp, notify, 0);
break;
default:
isp_prt(isp, ISP_LOGALL, "target notify code 0x%x", notify->nt_ncode);
- isp_handle_platform_target_notify_ack(isp, notify);
+ isp_handle_platform_target_notify_ack(isp, notify, 0);
break;
}
break;
diff --git a/sys/dev/ntb/if_ntb/if_ntb.c b/sys/dev/ntb/if_ntb/if_ntb.c
index d107d06..33645c4 100644
--- a/sys/dev/ntb/if_ntb/if_ntb.c
+++ b/sys/dev/ntb/if_ntb/if_ntb.c
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -25,21 +26,27 @@
* SUCH DAMAGE.
*/
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a driver for a simulated Ethernet device, using
+ * the underlying NTB Transport device.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
-#include <sys/bitset.h>
+#include <sys/buf_ring.h>
#include <sys/bus.h>
-#include <sys/ktr.h>
#include <sys/limits.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
#include <sys/module.h>
-#include <sys/mutex.h>
-#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
@@ -48,426 +55,163 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>
#include <net/ethernet.h>
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
#include <machine/bus.h>
-#include <machine/cpufunc.h>
-#include <machine/pmap.h>
-
-#include <netinet/in.h>
-#include <netinet/ip.h>
-
-#include "../ntb_hw/ntb_hw.h"
-
-/*
- * The Non-Transparent Bridge (NTB) is a device on some Intel processors that
- * allows you to connect two systems using a PCI-e link.
- *
- * This module contains a protocol for sending and receiving messages, and
- * exposes that protocol through a simulated ethernet device called ntb.
- *
- * NOTE: Much of the code in this module is shared with Linux. Any patches may
- * be picked up and redistributed in Linux with a dual GPL/BSD license.
- */
-#define QP_SETSIZE 64
-BITSET_DEFINE(_qpset, QP_SETSIZE);
-#define test_bit(pos, addr) BIT_ISSET(QP_SETSIZE, (pos), (addr))
-#define set_bit(pos, addr) BIT_SET(QP_SETSIZE, (pos), (addr))
-#define clear_bit(pos, addr) BIT_CLR(QP_SETSIZE, (pos), (addr))
-#define ffs_bit(addr) BIT_FFS(QP_SETSIZE, (addr))
+#include "../ntb_transport.h"
#define KTR_NTB KTR_SPARE3
+#define NTB_MEDIATYPE (IFM_ETHER | IFM_AUTO | IFM_FDX)
-#define NTB_TRANSPORT_VERSION 4
-#define NTB_RX_MAX_PKTS 64
-#define NTB_RXQ_SIZE 300
-
-enum ntb_link_event {
- NTB_LINK_DOWN = 0,
- NTB_LINK_UP,
-};
+#define NTB_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)
+#define NTB_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6)
+#define NTB_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \
+ CSUM_PSEUDO_HDR | \
+ CSUM_IP_CHECKED | CSUM_IP_VALID | \
+ CSUM_SCTP_VALID)
static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb");
-static unsigned g_if_ntb_debug_level;
-TUNABLE_INT("hw.if_ntb.debug_level", &g_if_ntb_debug_level);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN,
- &g_if_ntb_debug_level, 0, "if_ntb log level -- higher is more verbose");
-#define ntb_printf(lvl, ...) do { \
- if ((lvl) <= g_if_ntb_debug_level) { \
- if_printf(nt->ifp, __VA_ARGS__); \
- } \
-} while (0)
-
-static unsigned transport_mtu = IP_MAXPACKET + ETHER_HDR_LEN + ETHER_CRC_LEN;
-
-static uint64_t max_mw_size;
-TUNABLE_QUAD("hw.if_ntb.max_mw_size", &max_mw_size);
-SYSCTL_UQUAD(_hw_if_ntb, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0,
- "If enabled (non-zero), limit the size of large memory windows. "
- "Both sides of the NTB MUST set the same value here.");
-
-static unsigned max_num_clients;
-TUNABLE_INT("hw.if_ntb.max_num_clients", &max_num_clients);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, max_num_clients, CTLFLAG_RDTUN,
- &max_num_clients, 0, "Maximum number of NTB transport clients. "
- "0 (default) - use all available NTB memory windows; "
- "positive integer N - Limit to N memory windows.");
-
-static unsigned enable_xeon_watchdog;
-TUNABLE_INT("hw.if_ntb.enable_xeon_watchdog", &enable_xeon_watchdog);
-SYSCTL_UINT(_hw_if_ntb, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN,
- &enable_xeon_watchdog, 0, "If non-zero, write a register every second to "
- "keep a watchdog from tearing down the NTB link");
-
-STAILQ_HEAD(ntb_queue_list, ntb_queue_entry);
-
-typedef uint32_t ntb_q_idx_t;
-
-struct ntb_queue_entry {
- /* ntb_queue list reference */
- STAILQ_ENTRY(ntb_queue_entry) entry;
-
- /* info on data to be transferred */
- void *cb_data;
- void *buf;
- uint32_t len;
- uint32_t flags;
-
- struct ntb_transport_qp *qp;
- struct ntb_payload_header *x_hdr;
- ntb_q_idx_t index;
-};
-
-struct ntb_rx_info {
- ntb_q_idx_t entry;
-};
-
-struct ntb_transport_qp {
- struct ntb_transport_ctx *transport;
- struct ntb_softc *ntb;
-
- void *cb_data;
-
- bool client_ready;
- volatile bool link_is_up;
- uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */
-
- struct ntb_rx_info *rx_info;
- struct ntb_rx_info *remote_rx_info;
-
- void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- struct ntb_queue_list tx_free_q;
- struct mtx ntb_tx_free_q_lock;
- caddr_t tx_mw;
- bus_addr_t tx_mw_phys;
- ntb_q_idx_t tx_index;
- ntb_q_idx_t tx_max_entry;
- uint64_t tx_max_frame;
-
- void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- struct ntb_queue_list rx_post_q;
- struct ntb_queue_list rx_pend_q;
- /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */
- struct mtx ntb_rx_q_lock;
- struct task rx_completion_task;
- struct task rxc_db_work;
- caddr_t rx_buff;
- ntb_q_idx_t rx_index;
- ntb_q_idx_t rx_max_entry;
- uint64_t rx_max_frame;
-
- void (*event_handler)(void *data, enum ntb_link_event status);
- struct callout link_work;
- struct callout queue_full;
- struct callout rx_full;
-
- uint64_t last_rx_no_buf;
+static unsigned g_if_ntb_num_queues = UINT_MAX;
+SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN,
+ &g_if_ntb_num_queues, 0, "Number of queues per interface");
- /* Stats */
- uint64_t rx_bytes;
- uint64_t rx_pkts;
- uint64_t rx_ring_empty;
- uint64_t rx_err_no_buf;
- uint64_t rx_err_oflow;
- uint64_t rx_err_ver;
- uint64_t tx_bytes;
- uint64_t tx_pkts;
- uint64_t tx_ring_full;
- uint64_t tx_err_no_buf;
-};
-
-struct ntb_queue_handlers {
- void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
- void *data, int len);
- void (*event_handler)(void *data, enum ntb_link_event status);
-};
-
-struct ntb_transport_mw {
- vm_paddr_t phys_addr;
- size_t phys_size;
- size_t xlat_align;
- size_t xlat_align_size;
- bus_addr_t addr_limit;
- /* Tx buff is off vbase / phys_addr */
- caddr_t vbase;
- size_t xlat_size;
- size_t buff_size;
- /* Rx buff is off virt_addr / dma_addr */
- caddr_t virt_addr;
- bus_addr_t dma_addr;
-};
-
-struct ntb_transport_ctx {
- struct ntb_softc *ntb;
+struct ntb_net_queue {
+ struct ntb_net_ctx *sc;
struct ifnet *ifp;
- struct ntb_transport_mw mw_vec[NTB_MAX_NUM_MW];
- struct ntb_transport_qp *qp_vec;
- struct _qpset qp_bitmap;
- struct _qpset qp_bitmap_free;
- unsigned mw_count;
- unsigned qp_count;
- volatile bool link_is_up;
- struct callout link_work;
- struct callout link_watchdog;
- struct task link_cleanup;
- uint64_t bufsize;
- u_char eaddr[ETHER_ADDR_LEN];
- struct mtx tx_lock;
- struct mtx rx_lock;
-
- /* The hardcoded single queuepair in ntb_setup_interface() */
struct ntb_transport_qp *qp;
+ struct buf_ring *br;
+ struct task tx_task;
+ struct taskqueue *tx_tq;
+ struct mtx tx_lock;
+ struct callout queue_full;
};
-static struct ntb_transport_ctx net_softc;
-
-enum {
- IF_NTB_DESC_DONE_FLAG = 1 << 0,
- IF_NTB_LINK_DOWN_FLAG = 1 << 1,
-};
-
-struct ntb_payload_header {
- ntb_q_idx_t ver;
- uint32_t len;
- uint32_t flags;
-};
-
-enum {
- /*
- * The order of this enum is part of the if_ntb remote protocol. Do
- * not reorder without bumping protocol version (and it's probably best
- * to keep the protocol in lock-step with the Linux NTB driver.
- */
- IF_NTB_VERSION = 0,
- IF_NTB_QP_LINKS,
- IF_NTB_NUM_QPS,
- IF_NTB_NUM_MWS,
- /*
- * N.B.: transport_link_work assumes MW1 enums = MW0 + 2.
- */
- IF_NTB_MW0_SZ_HIGH,
- IF_NTB_MW0_SZ_LOW,
- IF_NTB_MW1_SZ_HIGH,
- IF_NTB_MW1_SZ_LOW,
- IF_NTB_MAX_SPAD,
-
- /*
- * Some NTB-using hardware have a watchdog to work around NTB hangs; if
- * a register or doorbell isn't written every few seconds, the link is
- * torn down. Write an otherwise unused register every few seconds to
- * work around this watchdog.
- */
- IF_NTB_WATCHDOG_SPAD = 15
+struct ntb_net_ctx {
+ struct ifnet *ifp;
+ struct ifmedia media;
+ u_char eaddr[ETHER_ADDR_LEN];
+ int num_queues;
+ struct ntb_net_queue *queues;
+ int mtu;
};
-CTASSERT(IF_NTB_WATCHDOG_SPAD < XEON_SPAD_COUNT &&
- IF_NTB_WATCHDOG_SPAD < ATOM_SPAD_COUNT);
-
-#define QP_TO_MW(nt, qp) ((qp) % nt->mw_count)
-#define NTB_QP_DEF_NUM_ENTRIES 100
-#define NTB_LINK_DOWN_TIMEOUT 10
-static int ntb_handle_module_events(struct module *m, int what, void *arg);
-static int ntb_setup_interface(void);
-static int ntb_teardown_interface(void);
+static int ntb_net_probe(device_t dev);
+static int ntb_net_attach(device_t dev);
+static int ntb_net_detach(device_t dev);
static void ntb_net_init(void *arg);
+static int ntb_ifmedia_upd(struct ifnet *);
+static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *);
static int ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
-static void ntb_start(struct ifnet *ifp);
+static int ntb_transmit(struct ifnet *ifp, struct mbuf *m);
static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
void *data, int len);
static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
void *data, int len);
static void ntb_net_event_handler(void *data, enum ntb_link_event status);
-static int ntb_transport_probe(struct ntb_softc *ntb);
-static void ntb_transport_free(struct ntb_transport_ctx *);
-static void ntb_transport_init_queue(struct ntb_transport_ctx *nt,
- unsigned int qp_num);
-static void ntb_transport_free_queue(struct ntb_transport_qp *qp);
-static struct ntb_transport_qp *ntb_transport_create_queue(void *data,
- struct ntb_softc *pdev, const struct ntb_queue_handlers *handlers);
-static void ntb_transport_link_up(struct ntb_transport_qp *qp);
-static int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb,
- void *data, unsigned int len);
-static int ntb_process_tx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry);
-static void ntb_memcpy_tx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry, void *offset);
+static void ntb_handle_tx(void *arg, int pending);
static void ntb_qp_full(void *arg);
-static void ntb_transport_rxc_db(void *arg, int pending);
-static int ntb_process_rxc(struct ntb_transport_qp *qp);
-static void ntb_memcpy_rx(struct ntb_transport_qp *qp,
- struct ntb_queue_entry *entry, void *offset);
-static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp,
- void *data);
-static void ntb_complete_rxc(void *arg, int pending);
-static void ntb_transport_doorbell_callback(void *data, uint32_t vector);
-static void ntb_transport_event_callback(void *data);
-static void ntb_transport_link_work(void *arg);
-static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size);
-static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw);
-static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
- unsigned int qp_num);
-static void ntb_qp_link_work(void *arg);
-static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt);
-static void ntb_transport_link_cleanup_work(void *, int);
-static void ntb_qp_link_down(struct ntb_transport_qp *qp);
-static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp);
-static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp);
-static void ntb_transport_link_down(struct ntb_transport_qp *qp);
-static void ntb_send_link_down(struct ntb_transport_qp *qp);
-static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
- struct ntb_queue_list *list);
-static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock,
- struct ntb_queue_list *list);
-static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock,
- struct ntb_queue_list *from, struct ntb_queue_list *to);
+static void ntb_qflush(struct ifnet *ifp);
static void create_random_local_eui48(u_char *eaddr);
-static unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
-static void xeon_link_watchdog_hb(void *);
-
-static const struct ntb_ctx_ops ntb_transport_ops = {
- .link_event = ntb_transport_event_callback,
- .db_event = ntb_transport_doorbell_callback,
-};
-MALLOC_DEFINE(M_NTB_IF, "if_ntb", "ntb network driver");
-
-static inline void
-iowrite32(uint32_t val, void *addr)
-{
-
- bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr,
- val);
-}
-
-/* Module setup and teardown */
static int
-ntb_handle_module_events(struct module *m, int what, void *arg)
+ntb_net_probe(device_t dev)
{
- int err = 0;
- switch (what) {
- case MOD_LOAD:
- err = ntb_setup_interface();
- break;
- case MOD_UNLOAD:
- err = ntb_teardown_interface();
- break;
- default:
- err = EOPNOTSUPP;
- break;
- }
- return (err);
+ device_set_desc(dev, "NTB Network Interface");
+ return (0);
}
-static moduledata_t if_ntb_mod = {
- "if_ntb",
- ntb_handle_module_events,
- NULL
-};
-
-DECLARE_MODULE(if_ntb, if_ntb_mod, SI_SUB_KLD, SI_ORDER_ANY);
-MODULE_DEPEND(if_ntb, ntb_hw, 1, 1, 1);
-
static int
-ntb_setup_interface(void)
+ntb_net_attach(device_t dev)
{
+ struct ntb_net_ctx *sc = device_get_softc(dev);
+ struct ntb_net_queue *q;
struct ifnet *ifp;
struct ntb_queue_handlers handlers = { ntb_net_rx_handler,
ntb_net_tx_handler, ntb_net_event_handler };
- int rc;
-
- net_softc.ntb = devclass_get_softc(devclass_find("ntb_hw"), 0);
- if (net_softc.ntb == NULL) {
- printf("ntb: Cannot find devclass\n");
- return (ENXIO);
- }
+ int i;
- ifp = net_softc.ifp = if_alloc(IFT_ETHER);
+ ifp = sc->ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
- ntb_transport_free(&net_softc);
printf("ntb: Cannot allocate ifnet structure\n");
return (ENOMEM);
}
- if_initname(ifp, "ntb", 0);
-
- rc = ntb_transport_probe(net_softc.ntb);
- if (rc != 0) {
- printf("ntb: Cannot init transport: %d\n", rc);
- if_free(net_softc.ifp);
- return (rc);
- }
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+ sc->num_queues = min(g_if_ntb_num_queues,
+ ntb_transport_queue_count(dev));
+ sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ sc->mtu = INT_MAX;
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ q->sc = sc;
+ q->ifp = ifp;
+ q->qp = ntb_transport_create_queue(dev, i, &handlers, q);
+ if (q->qp == NULL)
+ break;
+ sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp));
+ mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF);
+ q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock);
+ TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q);
+ q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT,
+ taskqueue_thread_enqueue, &q->tx_tq);
+ taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d",
+ device_get_nameunit(dev), i);
+ callout_init(&q->queue_full, 1);
+ }
+ sc->num_queues = i;
+ device_printf(dev, "%d queue(s)\n", sc->num_queues);
- net_softc.qp = ntb_transport_create_queue(ifp, net_softc.ntb,
- &handlers);
ifp->if_init = ntb_net_init;
- ifp->if_softc = &net_softc;
- ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX;
+ ifp->if_softc = sc;
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_ioctl = ntb_ioctl;
- ifp->if_start = ntb_start;
- IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN);
- ifp->if_snd.ifq_drv_maxlen = IFQ_MAXLEN;
- IFQ_SET_READY(&ifp->if_snd);
- create_random_local_eui48(net_softc.eaddr);
- ether_ifattach(ifp, net_softc.eaddr);
- ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_JUMBO_MTU;
- ifp->if_capenable = ifp->if_capabilities;
- ifp->if_mtu = ntb_transport_max_size(net_softc.qp) - ETHER_HDR_LEN -
- ETHER_CRC_LEN;
-
- ntb_transport_link_up(net_softc.qp);
- net_softc.bufsize = ntb_transport_max_size(net_softc.qp) +
- sizeof(struct ether_header);
+ ifp->if_transmit = ntb_transmit;
+ ifp->if_qflush = ntb_qflush;
+ create_random_local_eui48(sc->eaddr);
+ ether_ifattach(ifp, sc->eaddr);
+ ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 |
+ IFCAP_JUMBO_MTU | IFCAP_LINKSTATE;
+ ifp->if_capenable = IFCAP_JUMBO_MTU | IFCAP_LINKSTATE;
+ ifp->if_mtu = sc->mtu - ETHER_HDR_LEN;
+
+ ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd,
+ ntb_ifmedia_sts);
+ ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL);
+ ifmedia_set(&sc->media, NTB_MEDIATYPE);
+
+ for (i = 0; i < sc->num_queues; i++)
+ ntb_transport_link_up(sc->queues[i].qp);
return (0);
}
static int
-ntb_teardown_interface(void)
+ntb_net_detach(device_t dev)
{
+ struct ntb_net_ctx *sc = device_get_softc(dev);
+ struct ntb_net_queue *q;
+ int i;
- if (net_softc.qp != NULL) {
- ntb_transport_link_down(net_softc.qp);
-
- ntb_transport_free_queue(net_softc.qp);
- ntb_transport_free(&net_softc);
- }
-
- if (net_softc.ifp != NULL) {
- ether_ifdetach(net_softc.ifp);
- if_free(net_softc.ifp);
- net_softc.ifp = NULL;
- }
-
+ for (i = 0; i < sc->num_queues; i++)
+ ntb_transport_link_down(sc->queues[i].qp);
+ ether_ifdetach(sc->ifp);
+ if_free(sc->ifp);
+ ifmedia_removeall(&sc->media);
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ ntb_transport_free_queue(q->qp);
+ buf_ring_free(q->br, M_DEVBUF);
+ callout_drain(&q->queue_full);
+ taskqueue_drain_all(q->tx_tq);
+ mtx_destroy(&q->tx_lock);
+ }
+ free(sc->queues, M_DEVBUF);
return (0);
}
@@ -476,27 +220,26 @@ ntb_teardown_interface(void)
static void
ntb_net_init(void *arg)
{
- struct ntb_transport_ctx *ntb_softc = arg;
- struct ifnet *ifp = ntb_softc->ifp;
+ struct ntb_net_ctx *sc = arg;
+ struct ifnet *ifp = sc->ifp;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- ifp->if_flags |= IFF_UP;
- if_link_state_change(ifp, LINK_STATE_UP);
+ if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ?
+ LINK_STATE_UP : LINK_STATE_DOWN);
}
static int
ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
- struct ntb_transport_ctx *nt = ifp->if_softc;
+ struct ntb_net_ctx *sc = ifp->if_softc;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (command) {
case SIOCSIFMTU:
{
- if (ifr->ifr_mtu > ntb_transport_max_size(nt->qp) -
- ETHER_HDR_LEN - ETHER_CRC_LEN) {
+ if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) {
error = EINVAL;
break;
}
@@ -504,1185 +247,242 @@ ntb_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
break;
}
- default:
- error = ether_ioctl(ifp, command, data);
- break;
- }
-
- return (error);
-}
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
+ break;
-static void
-ntb_start(struct ifnet *ifp)
-{
- struct mbuf *m_head;
- struct ntb_transport_ctx *nt = ifp->if_softc;
- int rc;
-
- mtx_lock(&nt->tx_lock);
- ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- CTR0(KTR_NTB, "TX: ntb_start");
- while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
- CTR1(KTR_NTB, "TX: start mbuf %p", m_head);
- rc = ntb_transport_tx_enqueue(nt->qp, m_head, m_head,
- m_length(m_head, NULL));
- if (rc != 0) {
- CTR1(KTR_NTB,
- "TX: could not tx mbuf %p. Returning to snd q",
- m_head);
- if (rc == EAGAIN) {
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
- IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
- callout_reset(&nt->qp->queue_full, hz / 1000,
- ntb_qp_full, ifp);
- }
- break;
+ case SIOCSIFCAP:
+ if (ifr->ifr_reqcap & IFCAP_RXCSUM)
+ ifp->if_capenable |= IFCAP_RXCSUM;
+ else
+ ifp->if_capenable &= ~IFCAP_RXCSUM;
+ if (ifr->ifr_reqcap & IFCAP_TXCSUM) {
+ ifp->if_capenable |= IFCAP_TXCSUM;
+ ifp->if_hwassist |= NTB_CSUM_FEATURES;
+ } else {
+ ifp->if_capenable &= ~IFCAP_TXCSUM;
+ ifp->if_hwassist &= ~NTB_CSUM_FEATURES;
+ }
+ if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6)
+ ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
+ else
+ ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
+ if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) {
+ ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
+ ifp->if_hwassist |= NTB_CSUM_FEATURES6;
+ } else {
+ ifp->if_capenable &= ~IFCAP_TXCSUM_IPV6;
+ ifp->if_hwassist &= ~NTB_CSUM_FEATURES6;
}
-
- }
- mtx_unlock(&nt->tx_lock);
-}
-
-/* Network Device Callbacks */
-static void
-ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
- int len)
-{
-
- m_freem(data);
- CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data);
-}
-
-static void
-ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
- int len)
-{
- struct mbuf *m = data;
- struct ifnet *ifp = qp_data;
-
- CTR0(KTR_NTB, "RX: rx handler");
- (*ifp->if_input)(ifp, m);
-}
-
-static void
-ntb_net_event_handler(void *data, enum ntb_link_event status)
-{
- struct ifnet *ifp;
-
- ifp = data;
- (void)ifp;
-
- /* XXX The Linux driver munges with the carrier status here. */
-
- switch (status) {
- case NTB_LINK_DOWN:
- break;
- case NTB_LINK_UP:
break;
- default:
- panic("Bogus ntb_link_event %u\n", status);
- }
-}
-
-/* Transport Init and teardown */
-
-static void
-xeon_link_watchdog_hb(void *arg)
-{
- struct ntb_transport_ctx *nt;
-
- nt = arg;
- ntb_spad_write(nt->ntb, IF_NTB_WATCHDOG_SPAD, 0);
- callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt);
-}
-
-static int
-ntb_transport_probe(struct ntb_softc *ntb)
-{
- struct ntb_transport_ctx *nt = &net_softc;
- struct ntb_transport_mw *mw;
- uint64_t qp_bitmap;
- int rc;
- unsigned i;
-
- nt->mw_count = ntb_mw_count(ntb);
- for (i = 0; i < nt->mw_count; i++) {
- mw = &nt->mw_vec[i];
-
- rc = ntb_mw_get_range(ntb, i, &mw->phys_addr, &mw->vbase,
- &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size,
- &mw->addr_limit);
- if (rc != 0)
- goto err;
-
- mw->buff_size = 0;
- mw->xlat_size = 0;
- mw->virt_addr = NULL;
- mw->dma_addr = 0;
-
- rc = ntb_mw_set_wc(nt->ntb, i, VM_MEMATTR_WRITE_COMBINING);
- if (rc)
- ntb_printf(0, "Unable to set mw%d caching\n", i);
- }
-
- qp_bitmap = ntb_db_valid_mask(ntb);
- nt->qp_count = flsll(qp_bitmap);
- KASSERT(nt->qp_count != 0, ("bogus db bitmap"));
- nt->qp_count -= 1;
-
- if (max_num_clients != 0 && max_num_clients < nt->qp_count)
- nt->qp_count = max_num_clients;
- else if (nt->mw_count < nt->qp_count)
- nt->qp_count = nt->mw_count;
- KASSERT(nt->qp_count <= QP_SETSIZE, ("invalid qp_count"));
-
- mtx_init(&nt->tx_lock, "ntb transport tx", NULL, MTX_DEF);
- mtx_init(&nt->rx_lock, "ntb transport rx", NULL, MTX_DEF);
-
- nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_IF,
- M_WAITOK | M_ZERO);
-
- for (i = 0; i < nt->qp_count; i++) {
- set_bit(i, &nt->qp_bitmap);
- set_bit(i, &nt->qp_bitmap_free);
- ntb_transport_init_queue(nt, i);
- }
-
- callout_init(&nt->link_work, 0);
- callout_init(&nt->link_watchdog, 0);
- TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt);
-
- rc = ntb_set_ctx(ntb, nt, &ntb_transport_ops);
- if (rc != 0)
- goto err;
-
- nt->link_is_up = false;
- ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
- ntb_link_event(ntb);
-
- callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
- if (enable_xeon_watchdog != 0)
- callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt);
- return (0);
-
-err:
- free(nt->qp_vec, M_NTB_IF);
- nt->qp_vec = NULL;
- return (rc);
-}
-
-static void
-ntb_transport_free(struct ntb_transport_ctx *nt)
-{
- struct ntb_softc *ntb = nt->ntb;
- struct _qpset qp_bitmap_alloc;
- uint8_t i;
-
- ntb_transport_link_cleanup(nt);
- taskqueue_drain(taskqueue_swi, &nt->link_cleanup);
- callout_drain(&nt->link_work);
- callout_drain(&nt->link_watchdog);
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
- BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
- /* Verify that all the QPs are freed */
- for (i = 0; i < nt->qp_count; i++)
- if (test_bit(i, &qp_bitmap_alloc))
- ntb_transport_free_queue(&nt->qp_vec[i]);
-
- ntb_link_disable(ntb);
- ntb_clear_ctx(ntb);
-
- for (i = 0; i < nt->mw_count; i++)
- ntb_free_mw(nt, i);
-
- free(nt->qp_vec, M_NTB_IF);
-}
-
-static void
-ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num)
-{
- struct ntb_transport_mw *mw;
- struct ntb_transport_qp *qp;
- vm_paddr_t mw_base;
- uint64_t mw_size, qp_offset;
- size_t tx_size;
- unsigned num_qps_mw, mw_num, mw_count;
-
- mw_count = nt->mw_count;
- mw_num = QP_TO_MW(nt, qp_num);
- mw = &nt->mw_vec[mw_num];
-
- qp = &nt->qp_vec[qp_num];
- qp->qp_num = qp_num;
- qp->transport = nt;
- qp->ntb = nt->ntb;
- qp->client_ready = false;
- qp->event_handler = NULL;
- ntb_qp_link_down_reset(qp);
-
- if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count)
- num_qps_mw = nt->qp_count / mw_count + 1;
- else
- num_qps_mw = nt->qp_count / mw_count;
-
- mw_base = mw->phys_addr;
- mw_size = mw->phys_size;
-
- tx_size = mw_size / num_qps_mw;
- qp_offset = tx_size * (qp_num / mw_count);
-
- qp->tx_mw = mw->vbase + qp_offset;
- KASSERT(qp->tx_mw != NULL, ("uh oh?"));
-
- /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */
- qp->tx_mw_phys = mw_base + qp_offset;
- KASSERT(qp->tx_mw_phys != 0, ("uh oh?"));
-
- tx_size -= sizeof(struct ntb_rx_info);
- qp->rx_info = (void *)(qp->tx_mw + tx_size);
-
- /* Due to house-keeping, there must be at least 2 buffs */
- qp->tx_max_frame = qmin(tx_size / 2,
- transport_mtu + sizeof(struct ntb_payload_header));
- qp->tx_max_entry = tx_size / qp->tx_max_frame;
-
- callout_init(&qp->link_work, 0);
- callout_init(&qp->queue_full, CALLOUT_MPSAFE);
- callout_init(&qp->rx_full, CALLOUT_MPSAFE);
-
- mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN);
- mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN);
- TASK_INIT(&qp->rx_completion_task, 0, ntb_complete_rxc, qp);
- TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp);
-
- STAILQ_INIT(&qp->rx_post_q);
- STAILQ_INIT(&qp->rx_pend_q);
- STAILQ_INIT(&qp->tx_free_q);
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
-}
-
-static void
-ntb_transport_free_queue(struct ntb_transport_qp *qp)
-{
- struct ntb_queue_entry *entry;
-
- if (qp == NULL)
- return;
-
- callout_drain(&qp->link_work);
-
- ntb_db_set_mask(qp->ntb, 1ull << qp->qp_num);
- taskqueue_drain(taskqueue_swi, &qp->rxc_db_work);
- taskqueue_drain(taskqueue_swi, &qp->rx_completion_task);
-
- qp->cb_data = NULL;
- qp->rx_handler = NULL;
- qp->tx_handler = NULL;
- qp->event_handler = NULL;
-
- while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q)))
- free(entry, M_NTB_IF);
-
- while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q)))
- free(entry, M_NTB_IF);
-
- while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
- free(entry, M_NTB_IF);
-
- set_bit(qp->qp_num, &qp->transport->qp_bitmap_free);
-}
-
-/**
- * ntb_transport_create_queue - Create a new NTB transport layer queue
- * @rx_handler: receive callback function
- * @tx_handler: transmit callback function
- * @event_handler: event callback function
- *
- * Create a new NTB transport layer queue and provide the queue with a callback
- * routine for both transmit and receive. The receive callback routine will be
- * used to pass up data when the transport has received it on the queue. The
- * transmit callback routine will be called when the transport has completed the
- * transmission of the data on the queue and the data is ready to be freed.
- *
- * RETURNS: pointer to newly created ntb_queue, NULL on error.
- */
-static struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct ntb_softc *ntb,
- const struct ntb_queue_handlers *handlers)
-{
- struct ntb_queue_entry *entry;
- struct ntb_transport_qp *qp;
- struct ntb_transport_ctx *nt;
- unsigned int free_queue;
- int i;
-
- nt = ntb_get_ctx(ntb, NULL);
- KASSERT(nt != NULL, ("bogus"));
-
- free_queue = ffs_bit(&nt->qp_bitmap);
- if (free_queue == 0)
- return (NULL);
-
- /* decrement free_queue to make it zero based */
- free_queue--;
-
- qp = &nt->qp_vec[free_queue];
- clear_bit(qp->qp_num, &nt->qp_bitmap_free);
- qp->cb_data = data;
- qp->rx_handler = handlers->rx_handler;
- qp->tx_handler = handlers->tx_handler;
- qp->event_handler = handlers->event_handler;
-
- for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
- entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO);
- entry->cb_data = nt->ifp;
- entry->buf = NULL;
- entry->len = transport_mtu;
- ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q);
- }
-
- for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
- entry = malloc(sizeof(*entry), M_NTB_IF, M_WAITOK | M_ZERO);
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ default:
+ error = ether_ioctl(ifp, command, data);
+ break;
}
- ntb_db_clear(ntb, 1ull << qp->qp_num);
- ntb_db_clear_mask(ntb, 1ull << qp->qp_num);
- return (qp);
-}
-
-/**
- * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
- * @qp: NTB transport layer queue to be enabled
- *
- * Notify NTB transport layer of client readiness to use queue
- */
-static void
-ntb_transport_link_up(struct ntb_transport_qp *qp)
-{
- struct ntb_transport_ctx *nt;
-
- if (qp == NULL)
- return;
-
- qp->client_ready = true;
-
- nt = qp->transport;
- ntb_printf(2, "qp client ready\n");
-
- if (qp->transport->link_is_up)
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+ return (error);
}
-
-
-/* Transport Tx */
-
-/**
- * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
- * @qp: NTB transport layer queue the entry is to be enqueued on
- * @cb: per buffer pointer for callback function to use
- * @data: pointer to data buffer that will be sent
- * @len: length of the data buffer
- *
- * Enqueue a new transmit buffer onto the transport queue from which a NTB
- * payload will be transmitted. This assumes that a lock is being held to
- * serialize access to the qp.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
static int
-ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
- unsigned int len)
+ntb_ifmedia_upd(struct ifnet *ifp)
{
- struct ntb_queue_entry *entry;
- int rc;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ifmedia *ifm = &sc->media;
- if (qp == NULL || !qp->link_is_up || len == 0) {
- CTR0(KTR_NTB, "TX: link not up");
+ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
return (EINVAL);
- }
-
- entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
- if (entry == NULL) {
- CTR0(KTR_NTB, "TX: could not get entry from tx_free_q");
- qp->tx_err_no_buf++;
- return (EBUSY);
- }
- CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry);
-
- entry->cb_data = cb;
- entry->buf = data;
- entry->len = len;
- entry->flags = 0;
-
- rc = ntb_process_tx(qp, entry);
- if (rc != 0) {
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
- CTR1(KTR_NTB,
- "TX: process_tx failed. Returning entry %p to tx_free_q",
- entry);
- }
- return (rc);
-}
-
-static int
-ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
-{
- void *offset;
-
- offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
- CTR3(KTR_NTB,
- "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u",
- qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry);
- if (qp->tx_index == qp->remote_rx_info->entry) {
- CTR0(KTR_NTB, "TX: ring full");
- qp->tx_ring_full++;
- return (EAGAIN);
- }
-
- if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
- if (qp->tx_handler != NULL)
- qp->tx_handler(qp, qp->cb_data, entry->buf,
- EIO);
- else
- m_freem(entry->buf);
-
- entry->buf = NULL;
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
- CTR1(KTR_NTB,
- "TX: frame too big. returning entry %p to tx_free_q",
- entry);
- return (0);
- }
- CTR2(KTR_NTB, "TX: copying entry %p to offset %p", entry, offset);
- ntb_memcpy_tx(qp, entry, offset);
-
- qp->tx_index++;
- qp->tx_index %= qp->tx_max_entry;
-
- qp->tx_pkts++;
return (0);
}
static void
-ntb_memcpy_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
- void *offset)
-{
- struct ntb_payload_header *hdr;
-
- /* This piece is from Linux' ntb_async_tx() */
- hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame -
- sizeof(struct ntb_payload_header));
- entry->x_hdr = hdr;
- iowrite32(entry->len, &hdr->len);
- iowrite32(qp->tx_pkts, &hdr->ver);
-
- /* This piece is ntb_memcpy_tx() */
- CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset);
- if (entry->buf != NULL) {
- m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset);
-
- /*
- * Ensure that the data is fully copied before setting the
- * flags
- */
- wmb();
- }
-
- /* The rest is ntb_tx_copy_callback() */
- iowrite32(entry->flags | IF_NTB_DESC_DONE_FLAG, &hdr->flags);
- CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr);
-
- ntb_peer_db_set(qp->ntb, 1ull << qp->qp_num);
-
- /*
- * The entry length can only be zero if the packet is intended to be a
- * "link down" or similar. Since no payload is being sent in these
- * cases, there is nothing to add to the completion queue.
- */
- if (entry->len > 0) {
- qp->tx_bytes += entry->len;
-
- if (qp->tx_handler)
- qp->tx_handler(qp, qp->cb_data, entry->buf,
- entry->len);
- else
- m_freem(entry->buf);
- entry->buf = NULL;
- }
-
- CTR3(KTR_NTB,
- "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning "
- "to tx_free_q", entry, hdr->ver, hdr->flags);
- ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
-}
-
-static void
-ntb_qp_full(void *arg)
+ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
+ struct ntb_net_ctx *sc = ifp->if_softc;
- CTR0(KTR_NTB, "TX: qp_full callout");
- ntb_start(arg);
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = NTB_MEDIATYPE;
+ if (ntb_transport_link_query(sc->queues[0].qp))
+ ifmr->ifm_status |= IFM_ACTIVE;
}
-/* Transport Rx */
static void
-ntb_transport_rxc_db(void *arg, int pending __unused)
+ntb_transmit_locked(struct ntb_net_queue *q)
{
- struct ntb_transport_qp *qp = arg;
- ntb_q_idx_t i;
- int rc;
-
- /*
- * Limit the number of packets processed in a single interrupt to
- * provide fairness to others
- */
- CTR0(KTR_NTB, "RX: transport_rx");
- mtx_lock(&qp->transport->rx_lock);
- for (i = 0; i < qp->rx_max_entry; i++) {
- rc = ntb_process_rxc(qp);
+ struct ifnet *ifp = q->ifp;
+ struct mbuf *m;
+ int rc, len;
+ short mflags;
+
+ CTR0(KTR_NTB, "TX: ntb_transmit_locked");
+ while ((m = drbr_peek(ifp, q->br)) != NULL) {
+ CTR1(KTR_NTB, "TX: start mbuf %p", m);
+ ETHER_BPF_MTAP(ifp, m);
+ len = m->m_pkthdr.len;
+ mflags = m->m_flags;
+ rc = ntb_transport_tx_enqueue(q->qp, m, m, len);
if (rc != 0) {
- CTR0(KTR_NTB, "RX: process_rxc failed");
+ CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc);
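+			/*
+			 * EAGAIN means the transport ring is full: put the
+			 * mbuf back and retry from a callout; any other
+			 * error drops the packet.
+			 */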
+ if (rc == EAGAIN) {
+ drbr_putback(ifp, q->br, m);
+ callout_reset_sbt(&q->queue_full,
+ SBT_1MS / 4, SBT_1MS / 4,
+ ntb_qp_full, q, 0);
+ } else {
+ m_freem(m);
+ drbr_advance(ifp, q->br);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ }
break;
}
- }
- mtx_unlock(&qp->transport->rx_lock);
-
- if (i == qp->rx_max_entry)
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- else if ((ntb_db_read(qp->ntb) & (1ull << qp->qp_num)) != 0) {
- /* If db is set, clear it and read it back to commit clear. */
- ntb_db_clear(qp->ntb, 1ull << qp->qp_num);
- (void)ntb_db_read(qp->ntb);
-
- /*
- * An interrupt may have arrived between finishing
- * ntb_process_rxc and clearing the doorbell bit: there might
- * be some more work to do.
- */
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
+ drbr_advance(ifp, q->br);
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
+ if (mflags & M_MCAST)
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
}
}
static int
-ntb_process_rxc(struct ntb_transport_qp *qp)
+ntb_transmit(struct ifnet *ifp, struct mbuf *m)
{
- struct ntb_payload_header *hdr;
- struct ntb_queue_entry *entry;
- caddr_t offset;
-
- offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
- hdr = (void *)(offset + qp->rx_max_frame -
- sizeof(struct ntb_payload_header));
-
- CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index);
- if ((hdr->flags & IF_NTB_DESC_DONE_FLAG) == 0) {
- CTR0(KTR_NTB, "RX: hdr not done");
- qp->rx_ring_empty++;
- return (EAGAIN);
- }
-
- if ((hdr->flags & IF_NTB_LINK_DOWN_FLAG) != 0) {
- CTR0(KTR_NTB, "RX: link down");
- ntb_qp_link_down(qp);
- hdr->flags = 0;
- return (EAGAIN);
- }
-
- if (hdr->ver != (uint32_t)qp->rx_pkts) {
- CTR2(KTR_NTB,"RX: ver != rx_pkts (%x != %lx). "
- "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts);
- qp->rx_err_ver++;
- return (EIO);
- }
-
- entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q);
- if (entry == NULL) {
- qp->rx_err_no_buf++;
- CTR0(KTR_NTB, "RX: No entries in rx_pend_q");
- return (EAGAIN);
- }
- callout_stop(&qp->rx_full);
- CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry);
-
- entry->x_hdr = hdr;
- entry->index = qp->rx_index;
-
- if (hdr->len > entry->len) {
- CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju",
- (uintmax_t)hdr->len, (uintmax_t)entry->len);
- qp->rx_err_oflow++;
-
- entry->len = -EIO;
- entry->flags |= IF_NTB_DESC_DONE_FLAG;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ntb_net_queue *q;
+ int error, i;
- taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task);
- } else {
- qp->rx_bytes += hdr->len;
- qp->rx_pkts++;
-
- CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts);
-
- entry->len = hdr->len;
-
- ntb_memcpy_rx(qp, entry, offset);
- }
-
- qp->rx_index++;
- qp->rx_index %= qp->rx_max_entry;
+ CTR0(KTR_NTB, "TX: ntb_transmit");
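+	/* Select the TX queue by mbuf flow ID when present, else by current CPU. */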
+ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+ i = m->m_pkthdr.flowid % sc->num_queues;
+ else
+ i = curcpu % sc->num_queues;
+ q = &sc->queues[i];
+
+ error = drbr_enqueue(ifp, q->br, m);
+ if (error)
+ return (error);
+
+ if (mtx_trylock(&q->tx_lock)) {
+ ntb_transmit_locked(q);
+ mtx_unlock(&q->tx_lock);
+ } else
+ taskqueue_enqueue(q->tx_tq, &q->tx_task);
return (0);
}
static void
-ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
- void *offset)
+ntb_handle_tx(void *arg, int pending)
{
- struct ifnet *ifp = entry->cb_data;
- unsigned int len = entry->len;
- struct mbuf *m;
-
- CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset);
- m = m_devget(offset, len, 0, ifp, NULL);
- m->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
+ struct ntb_net_queue *q = arg;
- entry->buf = (void *)m;
-
- /* Ensure that the data is globally visible before clearing the flag */
- wmb();
-
- CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, m);
- ntb_rx_copy_callback(qp, entry);
+ mtx_lock(&q->tx_lock);
+ ntb_transmit_locked(q);
+ mtx_unlock(&q->tx_lock);
}
-static inline void
-ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data)
+static void
+ntb_qp_full(void *arg)
{
- struct ntb_queue_entry *entry;
+ struct ntb_net_queue *q = arg;
- entry = data;
- entry->flags |= IF_NTB_DESC_DONE_FLAG;
- taskqueue_enqueue(taskqueue_swi, &qp->rx_completion_task);
+ CTR0(KTR_NTB, "TX: qp_full callout");
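+	/*
+	 * Resume transmission once the transport has free TX entries;
+	 * otherwise re-arm and check again in ~250us.
+	 */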
+ if (ntb_transport_tx_free_entry(q->qp) > 0)
+ taskqueue_enqueue(q->tx_tq, &q->tx_task);
+ else
+ callout_schedule_sbt(&q->queue_full,
+ SBT_1MS / 4, SBT_1MS / 4, 0);
}
static void
-ntb_complete_rxc(void *arg, int pending)
+ntb_qflush(struct ifnet *ifp)
{
- struct ntb_transport_qp *qp = arg;
- struct ntb_queue_entry *entry;
+ struct ntb_net_ctx *sc = ifp->if_softc;
+ struct ntb_net_queue *q;
struct mbuf *m;
- unsigned len;
-
- CTR0(KTR_NTB, "RX: rx_completion_task");
-
- mtx_lock_spin(&qp->ntb_rx_q_lock);
-
- while (!STAILQ_EMPTY(&qp->rx_post_q)) {
- entry = STAILQ_FIRST(&qp->rx_post_q);
- if ((entry->flags & IF_NTB_DESC_DONE_FLAG) == 0)
- break;
-
- entry->x_hdr->flags = 0;
- iowrite32(entry->index, &qp->rx_info->entry);
-
- STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry);
-
- len = entry->len;
- m = entry->buf;
-
- /*
- * Re-initialize queue_entry for reuse; rx_handler takes
- * ownership of the mbuf.
- */
- entry->buf = NULL;
- entry->len = transport_mtu;
- entry->cb_data = qp->transport->ifp;
-
- STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry);
-
- mtx_unlock_spin(&qp->ntb_rx_q_lock);
+ int i;
- CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m);
- if (qp->rx_handler != NULL && qp->client_ready)
- qp->rx_handler(qp, qp->cb_data, m, len);
- else
+ for (i = 0; i < sc->num_queues; i++) {
+ q = &sc->queues[i];
+ mtx_lock(&q->tx_lock);
+ while ((m = buf_ring_dequeue_sc(q->br)) != NULL)
m_freem(m);
-
- mtx_lock_spin(&qp->ntb_rx_q_lock);
- }
-
- mtx_unlock_spin(&qp->ntb_rx_q_lock);
-}
-
-static void
-ntb_transport_doorbell_callback(void *data, uint32_t vector)
-{
- struct ntb_transport_ctx *nt = data;
- struct ntb_transport_qp *qp;
- struct _qpset db_bits;
- uint64_t vec_mask;
- unsigned qp_num;
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &db_bits);
- BIT_NAND(QP_SETSIZE, &db_bits, &nt->qp_bitmap_free);
-
- vec_mask = ntb_db_vector_mask(nt->ntb, vector);
- while (vec_mask != 0) {
- qp_num = ffsll(vec_mask) - 1;
-
- if (test_bit(qp_num, &db_bits)) {
- qp = &nt->qp_vec[qp_num];
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- }
-
- vec_mask &= ~(1ull << qp_num);
- }
-}
-
-/* Link Event handler */
-static void
-ntb_transport_event_callback(void *data)
-{
- struct ntb_transport_ctx *nt = data;
-
- if (ntb_link_is_up(nt->ntb, NULL, NULL)) {
- ntb_printf(1, "HW link up\n");
- callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
- } else {
- ntb_printf(1, "HW link down\n");
- taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup);
+ mtx_unlock(&q->tx_lock);
}
+ if_qflush(ifp);
}
-/* Link bring up */
+/* Network Device Callbacks */
static void
-ntb_transport_link_work(void *arg)
-{
- struct ntb_transport_ctx *nt = arg;
- struct ntb_softc *ntb = nt->ntb;
- struct ntb_transport_qp *qp;
- uint64_t val64, size;
- uint32_t val;
- unsigned i;
- int rc;
-
- /* send the local info, in the opposite order of the way we read it */
- for (i = 0; i < nt->mw_count; i++) {
- size = nt->mw_vec[i].phys_size;
-
- if (max_mw_size != 0 && size > max_mw_size)
- size = max_mw_size;
-
- ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2),
- size >> 32);
- ntb_peer_spad_write(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), size);
- }
-
- ntb_peer_spad_write(ntb, IF_NTB_NUM_MWS, nt->mw_count);
-
- ntb_peer_spad_write(ntb, IF_NTB_NUM_QPS, nt->qp_count);
-
- ntb_peer_spad_write(ntb, IF_NTB_VERSION, NTB_TRANSPORT_VERSION);
-
- /* Query the remote side for its info */
- val = 0;
- ntb_spad_read(ntb, IF_NTB_VERSION, &val);
- if (val != NTB_TRANSPORT_VERSION)
- goto out;
-
- ntb_spad_read(ntb, IF_NTB_NUM_QPS, &val);
- if (val != nt->qp_count)
- goto out;
-
- ntb_spad_read(ntb, IF_NTB_NUM_MWS, &val);
- if (val != nt->mw_count)
- goto out;
-
- for (i = 0; i < nt->mw_count; i++) {
- ntb_spad_read(ntb, IF_NTB_MW0_SZ_HIGH + (i * 2), &val);
- val64 = (uint64_t)val << 32;
-
- ntb_spad_read(ntb, IF_NTB_MW0_SZ_LOW + (i * 2), &val);
- val64 |= val;
-
- rc = ntb_set_mw(nt, i, val64);
- if (rc != 0)
- goto free_mws;
- }
-
- nt->link_is_up = true;
- ntb_printf(1, "transport link up\n");
-
- for (i = 0; i < nt->qp_count; i++) {
- qp = &nt->qp_vec[i];
-
- ntb_transport_setup_qp_mw(nt, i);
-
- if (qp->client_ready)
- callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
- }
-
- return;
-
-free_mws:
- for (i = 0; i < nt->mw_count; i++)
- ntb_free_mw(nt, i);
-out:
- if (ntb_link_is_up(ntb, NULL, NULL))
- callout_reset(&nt->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt);
-}
-
-static int
-ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size)
+ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
+ int len)
{
- struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
- size_t xlat_size, buff_size;
- int rc;
-
- if (size == 0)
- return (EINVAL);
-
- xlat_size = roundup(size, mw->xlat_align_size);
- buff_size = xlat_size;
-
- /* No need to re-setup */
- if (mw->xlat_size == xlat_size)
- return (0);
-
- if (mw->buff_size != 0)
- ntb_free_mw(nt, num_mw);
-
- /* Alloc memory for receiving data. Must be aligned */
- mw->xlat_size = xlat_size;
- mw->buff_size = buff_size;
-
- mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_IF, M_ZERO, 0,
- mw->addr_limit, mw->xlat_align, 0);
- if (mw->virt_addr == NULL) {
- ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n",
- mw->buff_size, mw->xlat_size);
- mw->xlat_size = 0;
- mw->buff_size = 0;
- return (ENOMEM);
- }
- /* TODO: replace with bus_space_* functions */
- mw->dma_addr = vtophys(mw->virt_addr);
-
- /*
- * Ensure that the allocation from contigmalloc is aligned as
- * requested. XXX: This may not be needed -- brought in for parity
- * with the Linux driver.
- */
- if (mw->dma_addr % mw->xlat_align != 0) {
- ntb_printf(0,
- "DMA memory 0x%jx not aligned to BAR size 0x%zx\n",
- (uintmax_t)mw->dma_addr, size);
- ntb_free_mw(nt, num_mw);
- return (ENOMEM);
- }
-
- /* Notify HW the memory location of the receive buffer */
- rc = ntb_mw_set_trans(nt->ntb, num_mw, mw->dma_addr, mw->xlat_size);
- if (rc) {
- ntb_printf(0, "Unable to set mw%d translation\n", num_mw);
- ntb_free_mw(nt, num_mw);
- return (rc);
- }
- return (0);
+ m_freem(data);
+ CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data);
}
static void
-ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
+ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
+ int len)
{
- struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+ struct ntb_net_queue *q = qp_data;
+ struct ntb_net_ctx *sc = q->sc;
+ struct mbuf *m = data;
+ struct ifnet *ifp = q->ifp;
+ uint16_t proto;
- if (mw->virt_addr == NULL)
+ CTR1(KTR_NTB, "RX: rx handler (%d)", len);
+ if (len < 0) {
+ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
return;
-
- ntb_mw_clear_trans(nt->ntb, num_mw);
- contigfree(mw->virt_addr, mw->xlat_size, M_NTB_IF);
- mw->xlat_size = 0;
- mw->buff_size = 0;
- mw->virt_addr = NULL;
-}
-
-static int
-ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num)
-{
- struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
- struct ntb_transport_mw *mw;
- void *offset;
- ntb_q_idx_t i;
- size_t rx_size;
- unsigned num_qps_mw, mw_num, mw_count;
-
- mw_count = nt->mw_count;
- mw_num = QP_TO_MW(nt, qp_num);
- mw = &nt->mw_vec[mw_num];
-
- if (mw->virt_addr == NULL)
- return (ENOMEM);
-
- if (nt->qp_count % mw_count && mw_num + 1 < nt->qp_count / mw_count)
- num_qps_mw = nt->qp_count / mw_count + 1;
- else
- num_qps_mw = nt->qp_count / mw_count;
-
- rx_size = mw->xlat_size / num_qps_mw;
- qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
- rx_size -= sizeof(struct ntb_rx_info);
-
- qp->remote_rx_info = (void*)(qp->rx_buff + rx_size);
-
- /* Due to house-keeping, there must be at least 2 buffs */
- qp->rx_max_frame = qmin(rx_size / 2,
- transport_mtu + sizeof(struct ntb_payload_header));
- qp->rx_max_entry = rx_size / qp->rx_max_frame;
- qp->rx_index = 0;
-
- qp->remote_rx_info->entry = qp->rx_max_entry - 1;
-
- /* Set up the hdr offsets with 0s */
- for (i = 0; i < qp->rx_max_entry; i++) {
- offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) -
- sizeof(struct ntb_payload_header));
- memset(offset, 0, sizeof(struct ntb_payload_header));
}
- qp->rx_pkts = 0;
- qp->tx_pkts = 0;
- qp->tx_index = 0;
-
- return (0);
-}
-
-static void
-ntb_qp_link_work(void *arg)
-{
- struct ntb_transport_qp *qp = arg;
- struct ntb_softc *ntb = qp->ntb;
- struct ntb_transport_ctx *nt = qp->transport;
- uint32_t val, dummy;
-
- ntb_spad_read(ntb, IF_NTB_QP_LINKS, &val);
-
- ntb_peer_spad_write(ntb, IF_NTB_QP_LINKS, val | (1ull << qp->qp_num));
-
- /* query remote spad for qp ready bits */
- ntb_peer_spad_read(ntb, IF_NTB_QP_LINKS, &dummy);
-
- /* See if the remote side is up */
- if ((val & (1ull << qp->qp_num)) != 0) {
- ntb_printf(2, "qp link up\n");
- qp->link_is_up = true;
-
- if (qp->event_handler != NULL)
- qp->event_handler(qp->cb_data, NTB_LINK_UP);
-
- taskqueue_enqueue(taskqueue_swi, &qp->rxc_db_work);
- } else if (nt->link_is_up)
- callout_reset(&qp->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
-}
-
-/* Link down event*/
-static void
-ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
-{
- struct ntb_transport_qp *qp;
- struct _qpset qp_bitmap_alloc;
- unsigned i;
-
- BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
- BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
- /* Pass along the info to any clients */
- for (i = 0; i < nt->qp_count; i++)
- if (test_bit(i, &qp_bitmap_alloc)) {
- qp = &nt->qp_vec[i];
- ntb_qp_link_cleanup(qp);
- callout_drain(&qp->link_work);
- }
-
- if (!nt->link_is_up)
- callout_drain(&nt->link_work);
-
- /*
- * The scratchpad registers keep the values if the remote side
- * goes down, blast them now to give them a sane value the next
- * time they are accessed
- */
- for (i = 0; i < IF_NTB_MAX_SPAD; i++)
- ntb_spad_write(nt->ntb, i, 0);
-}
-
-static void
-ntb_transport_link_cleanup_work(void *arg, int pending __unused)
-{
-
- ntb_transport_link_cleanup(arg);
-}
-
-static void
-ntb_qp_link_down(struct ntb_transport_qp *qp)
-{
-
- ntb_qp_link_cleanup(qp);
-}
-
-static void
-ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
-{
-
- qp->link_is_up = false;
-
- qp->tx_index = qp->rx_index = 0;
- qp->tx_bytes = qp->rx_bytes = 0;
- qp->tx_pkts = qp->rx_pkts = 0;
-
- qp->rx_ring_empty = 0;
- qp->tx_ring_full = 0;
-
- qp->rx_err_no_buf = qp->tx_err_no_buf = 0;
- qp->rx_err_oflow = qp->rx_err_ver = 0;
-}
-
-static void
-ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
-{
- struct ntb_transport_ctx *nt = qp->transport;
-
- callout_drain(&qp->link_work);
- ntb_qp_link_down_reset(qp);
-
- if (qp->event_handler != NULL)
- qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
-
- if (nt->link_is_up)
- callout_reset(&qp->link_work,
- NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
-}
-
-/* Link commanded down */
-/**
- * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
- * @qp: NTB transport layer queue to be disabled
- *
- * Notify NTB transport layer of client's desire to no longer receive data on
- * transport queue specified. It is the client's responsibility to ensure all
- * entries on queue are purged or otherwise handled appropriately.
- */
-static void
-ntb_transport_link_down(struct ntb_transport_qp *qp)
-{
- uint32_t val;
-
- if (qp == NULL)
- return;
-
- qp->client_ready = false;
-
- ntb_spad_read(qp->ntb, IF_NTB_QP_LINKS, &val);
-
- ntb_peer_spad_write(qp->ntb, IF_NTB_QP_LINKS,
- val & ~(1 << qp->qp_num));
-
- if (qp->link_is_up)
- ntb_send_link_down(qp);
- else
- callout_drain(&qp->link_work);
-}
-
-static void
-ntb_send_link_down(struct ntb_transport_qp *qp)
-{
- struct ntb_queue_entry *entry;
- int i, rc;
-
- if (!qp->link_is_up)
- return;
-
- for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
- entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
- if (entry != NULL)
+ m->m_pkthdr.rcvif = ifp;
+ if (sc->num_queues > 1) {
+ m->m_pkthdr.flowid = q - sc->queues;
+ M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
+ }
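+	/*
+	 * With RX checksum offload enabled, mark IP/IPv6 frames as already
+	 * verified so the stack skips software checksumming.
+	 */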
+ if (ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
+ m_copydata(m, 12, 2, (void *)&proto);
+ switch (ntohs(proto)) {
+ case ETHERTYPE_IP:
+ if (ifp->if_capenable & IFCAP_RXCSUM) {
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = NTB_CSUM_SET;
+ }
+ break;
+ case ETHERTYPE_IPV6:
+ if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = NTB_CSUM_SET;
+ }
break;
- pause("NTB Wait for link down", hz / 10);
+ }
}
-
- if (entry == NULL)
- return;
-
- entry->cb_data = NULL;
- entry->buf = NULL;
- entry->len = 0;
- entry->flags = IF_NTB_LINK_DOWN_FLAG;
-
- mtx_lock(&qp->transport->tx_lock);
- rc = ntb_process_tx(qp, entry);
- if (rc != 0)
- printf("ntb: Failed to send link down\n");
- mtx_unlock(&qp->transport->tx_lock);
-
- ntb_qp_link_down_reset(qp);
+ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
+ ifp->if_input(ifp, m);
}
-
-/* List Management */
-
static void
-ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
- struct ntb_queue_list *list)
-{
-
- mtx_lock_spin(lock);
- STAILQ_INSERT_TAIL(list, entry, entry);
- mtx_unlock_spin(lock);
-}
-
-static struct ntb_queue_entry *
-ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list)
-{
- struct ntb_queue_entry *entry;
-
- mtx_lock_spin(lock);
- if (STAILQ_EMPTY(list)) {
- entry = NULL;
- goto out;
- }
- entry = STAILQ_FIRST(list);
- STAILQ_REMOVE_HEAD(list, entry);
-out:
- mtx_unlock_spin(lock);
-
- return (entry);
-}
-
-static struct ntb_queue_entry *
-ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from,
- struct ntb_queue_list *to)
+ntb_net_event_handler(void *data, enum ntb_link_event status)
{
- struct ntb_queue_entry *entry;
+ struct ntb_net_queue *q = data;
+ int new_state;
- mtx_lock_spin(lock);
- if (STAILQ_EMPTY(from)) {
- entry = NULL;
- goto out;
+ switch (status) {
+ case NTB_LINK_DOWN:
+ new_state = LINK_STATE_DOWN;
+ break;
+ case NTB_LINK_UP:
+ new_state = LINK_STATE_UP;
+ break;
+ default:
+ new_state = LINK_STATE_UNKNOWN;
+ break;
}
- entry = STAILQ_FIRST(from);
- STAILQ_REMOVE_HEAD(from, entry);
- STAILQ_INSERT_TAIL(to, entry, entry);
-
-out:
- mtx_unlock_spin(lock);
- return (entry);
+ if_link_state_change(q->ifp, new_state);
}
/* Helper functions */
@@ -1693,27 +493,24 @@ static void
create_random_local_eui48(u_char *eaddr)
{
static uint8_t counter = 0;
- uint32_t seed = ticks;
eaddr[0] = EUI48_LOCALLY_ADMINISTERED;
- memcpy(&eaddr[1], &seed, sizeof(uint32_t));
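+	/*
+	 * Bytes 1-4 are random; byte 5 is a counter that keeps multiple
+	 * interfaces on the same host distinct.
+	 */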
+ arc4rand(&eaddr[1], 4, 0);
eaddr[5] = counter++;
}
-/**
- * ntb_transport_max_size - Query the max payload size of a qp
- * @qp: NTB transport layer queue to be queried
- *
- * Query the maximum payload size permissible on the given qp
- *
- * RETURNS: the max payload size of a qp
- */
-static unsigned int
-ntb_transport_max_size(struct ntb_transport_qp *qp)
-{
-
- if (qp == NULL)
- return (0);
+static device_method_t ntb_net_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ntb_net_probe),
+ DEVMETHOD(device_attach, ntb_net_attach),
+ DEVMETHOD(device_detach, ntb_net_detach),
+ DEVMETHOD_END
+};
- return (qp->tx_max_frame - sizeof(struct ntb_payload_header));
-}
+devclass_t ntb_net_devclass;
+static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods,
+ sizeof(struct ntb_net_ctx));
+DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass,
+ NULL, NULL);
+MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1);
+MODULE_VERSION(if_ntb, 1);
diff --git a/sys/dev/ntb/ntb.c b/sys/dev/ntb/ntb.c
new file mode 100644
index 0000000..1cf1ba2
--- /dev/null
+++ b/sys/dev/ntb/ntb.c
@@ -0,0 +1,463 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+
+#include "ntb.h"
+
+devclass_t ntb_hw_devclass;
+SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls");
+
+struct ntb_child {
+ device_t dev;
+ int enabled;
+ int mwoff;
+ int mwcnt;
+ int spadoff;
+ int spadcnt;
+ int dboff;
+ int dbmask;
+ void *ctx;
+ const struct ntb_ctx_ops *ctx_ops;
+ struct rmlock ctx_lock;
+ struct ntb_child *next;
+};
+
+int
+ntb_register_device(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ int i, mw, mwu, mwt, spad, spadu, spadt, db, dbu, dbt;
+ char cfg[128] = "";
+ char buf[32];
+ char *n, *np, *c, *p, *name;
+
+ mwu = 0;
+ mwt = NTB_MW_COUNT(dev);
+ spadu = 0;
+ spadt = NTB_SPAD_COUNT(dev);
+ dbu = 0;
+ dbt = flsll(NTB_DB_VALID_MASK(dev));
+
+ device_printf(dev, "%d memory windows, %d scratchpads, "
+ "%d doorbells\n", mwt, spadt, dbt);
+
+ snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev),
+ device_get_unit(dev));
+ TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg));
+ n = cfg;
+ i = 0;
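+	/*
+	 * Each comma-separated term of the config hint is "name:mw:spad:db";
+	 * empty counts default to all remaining resources and an empty name
+	 * leaves the child's driver unspecified.
+	 */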
+ while ((c = strsep(&n, ",")) != NULL) {
+ np = c;
+ name = strsep(&np, ":");
+ if (name != NULL && name[0] == 0)
+ name = NULL;
+ p = strsep(&np, ":");
+ mw = (p && p[0] != 0) ? strtol(p, NULL, 10) : mwt - mwu;
+ p = strsep(&np, ":");
+ spad = (p && p[0] != 0) ? strtol(p, NULL, 10) : spadt - spadu;
+ db = (np && np[0] != 0) ? strtol(np, NULL, 10) : dbt - dbu;
+
+ if (mw > mwt - mwu || spad > spadt - spadu || db > dbt - dbu) {
+ device_printf(dev, "Not enough resources for config\n");
+ break;
+ }
+
+ nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO);
+ nc->mwoff = mwu;
+ nc->mwcnt = mw;
+ nc->spadoff = spadu;
+ nc->spadcnt = spad;
+ nc->dboff = dbu;
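+		/*
+		 * Set the low 'db' bits; db == 0 is special-cased to avoid
+		 * an undefined 64-bit shift by 64.
+		 */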
+ nc->dbmask = (db == 0) ? 0 : (0xffffffffffffffff >> (64 - db));
+ rm_init(&nc->ctx_lock, "ntb ctx");
+ nc->dev = device_add_child(dev, name, -1);
+ if (nc->dev == NULL) {
+ ntb_unregister_device(dev);
+ return (ENOMEM);
+ }
+ device_set_ivars(nc->dev, nc);
+ *cpp = nc;
+ cpp = &nc->next;
+
+ if (bootverbose) {
+ device_printf(dev, "%d \"%s\":", i, name);
+ if (mw > 0) {
+ printf(" memory windows %d", mwu);
+ if (mw > 1)
+ printf("-%d", mwu + mw - 1);
+ }
+ if (spad > 0) {
+ printf(" scratchpads %d", spadu);
+ if (spad > 1)
+ printf("-%d", spadu + spad - 1);
+ }
+ if (db > 0) {
+ printf(" doorbells %d", dbu);
+ if (db > 1)
+ printf("-%d", dbu + db - 1);
+ }
+ printf("\n");
+ }
+
+ mwu += mw;
+ spadu += spad;
+ dbu += db;
+ i++;
+ }
+
+ bus_generic_attach(dev);
+ return (0);
+}
+
+int
+ntb_unregister_device(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ int error = 0;
+
+ while ((nc = *cpp) != NULL) {
+ *cpp = (*cpp)->next;
+ error = device_delete_child(dev, nc->dev);
+ if (error)
+ break;
+ rm_destroy(&nc->ctx_lock);
+ free(nc, M_DEVBUF);
+ }
+ return (error);
+}
+
+void
+ntb_link_event(device_t dev)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ struct rm_priotracker ctx_tracker;
+
+ for (nc = *cpp; nc != NULL; nc = nc->next) {
+ rm_rlock(&nc->ctx_lock, &ctx_tracker);
+ if (nc->ctx_ops != NULL && nc->ctx_ops->link_event != NULL)
+ nc->ctx_ops->link_event(nc->ctx);
+ rm_runlock(&nc->ctx_lock, &ctx_tracker);
+ }
+}
+
+void
+ntb_db_event(device_t dev, uint32_t vec)
+{
+ struct ntb_child **cpp = device_get_softc(dev);
+ struct ntb_child *nc;
+ struct rm_priotracker ctx_tracker;
+
+ for (nc = *cpp; nc != NULL; nc = nc->next) {
+ rm_rlock(&nc->ctx_lock, &ctx_tracker);
+ if (nc->ctx_ops != NULL && nc->ctx_ops->db_event != NULL)
+ nc->ctx_ops->db_event(nc->ctx, vec);
+ rm_runlock(&nc->ctx_lock, &ctx_tracker);
+ }
+}
+
+bool
+ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width)
+{
+
+ return (NTB_LINK_IS_UP(device_get_parent(ntb), speed, width));
+}
+
+int
+ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev));
+ struct ntb_child *nc1;
+
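+	/*
+	 * The link is shared among children: touch the hardware only if
+	 * no sibling has already enabled it.
+	 */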
+ for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) {
+ if (nc1->enabled) {
+ nc->enabled = 1;
+ return (0);
+ }
+ }
+ nc->enabled = 1;
+ return (NTB_LINK_ENABLE(device_get_parent(ntb), speed, width));
+}
+
+int
+ntb_link_disable(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ struct ntb_child **cpp = device_get_softc(device_get_parent(nc->dev));
+ struct ntb_child *nc1;
+
+ if (!nc->enabled)
+ return (0);
+ nc->enabled = 0;
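+	/* Keep the hardware link up while any sibling still has it enabled. */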
+ for (nc1 = *cpp; nc1 != NULL; nc1 = nc1->next) {
+ if (nc1->enabled)
+ return (0);
+ }
+ return (NTB_LINK_DISABLE(device_get_parent(ntb)));
+}
+
+bool
+ntb_link_enabled(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->enabled && NTB_LINK_ENABLED(device_get_parent(ntb)));
+}
+
+int
+ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ if (ctx == NULL || ctx_ops == NULL)
+ return (EINVAL);
+
+ rm_wlock(&nc->ctx_lock);
+ if (nc->ctx_ops != NULL) {
+ rm_wunlock(&nc->ctx_lock);
+ return (EINVAL);
+ }
+ nc->ctx = ctx;
+ nc->ctx_ops = ctx_ops;
+ rm_wunlock(&nc->ctx_lock);
+
+ return (0);
+}
+
+void *
+ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ KASSERT(nc->ctx != NULL && nc->ctx_ops != NULL, ("bogus"));
+ if (ctx_ops != NULL)
+ *ctx_ops = nc->ctx_ops;
+ return (nc->ctx);
+}
+
+void
+ntb_clear_ctx(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ rm_wlock(&nc->ctx_lock);
+ nc->ctx = NULL;
+ nc->ctx_ops = NULL;
+ rm_wunlock(&nc->ctx_lock);
+}
+
+uint8_t
+ntb_mw_count(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->mwcnt);
+}
+
+int
+ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base,
+ caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
+ bus_addr_t *plimit)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_GET_RANGE(device_get_parent(ntb), mw_idx + nc->mwoff,
+ base, vbase, size, align, align_size, plimit));
+}
+
+int
+ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr, size_t size)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_SET_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff,
+ addr, size));
+}
+
+int
+ntb_mw_clear_trans(device_t ntb, unsigned mw_idx)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_CLEAR_TRANS(device_get_parent(ntb), mw_idx + nc->mwoff));
+}
+
+int
+ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_GET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode));
+}
+
+int
+ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_MW_SET_WC(device_get_parent(ntb), mw_idx + nc->mwoff, mode));
+}
+
+uint8_t
+ntb_spad_count(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->spadcnt);
+}
+
+void
+ntb_spad_clear(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+ unsigned i;
+
+ for (i = 0; i < nc->spadcnt; i++)
+ NTB_SPAD_WRITE(device_get_parent(ntb), i + nc->spadoff, 0);
+}
+
+int
+ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff, val));
+}
+
+int
+ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff, val));
+}
+
+int
+ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_SPAD_WRITE(device_get_parent(ntb), idx + nc->spadoff,
+ val));
+}
+
+int
+ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_SPAD_READ(device_get_parent(ntb), idx + nc->spadoff,
+ val));
+}
+
+uint64_t
+ntb_db_valid_mask(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (nc->dbmask);
+}
+
+int
+ntb_db_vector_count(device_t ntb)
+{
+
+ return (NTB_DB_VECTOR_COUNT(device_get_parent(ntb)));
+}
+
+uint64_t
+ntb_db_vector_mask(device_t ntb, uint32_t vector)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
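+	/* Shift the parent's mask down so this child's doorbells start at bit 0. */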
+ return ((NTB_DB_VECTOR_MASK(device_get_parent(ntb), vector)
+ >> nc->dboff) & nc->dbmask);
+}
+
+int
+ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size)
+{
+
+ return (NTB_PEER_DB_ADDR(device_get_parent(ntb), db_addr, db_size));
+}
+
+void
+ntb_db_clear(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_CLEAR(device_get_parent(ntb), bits << nc->dboff));
+}
+
+void
+ntb_db_clear_mask(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_CLEAR_MASK(device_get_parent(ntb), bits << nc->dboff));
+}
+
+uint64_t
+ntb_db_read(device_t ntb)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return ((NTB_DB_READ(device_get_parent(ntb)) >> nc->dboff)
+ & nc->dbmask);
+}
+
+void
+ntb_db_set_mask(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_DB_SET_MASK(device_get_parent(ntb), bits << nc->dboff));
+}
+
+void
+ntb_peer_db_set(device_t ntb, uint64_t bits)
+{
+ struct ntb_child *nc = device_get_ivars(ntb);
+
+ return (NTB_PEER_DB_SET(device_get_parent(ntb), bits << nc->dboff));
+}
+
+MODULE_VERSION(ntb, 1);
diff --git a/sys/dev/ntb/ntb.h b/sys/dev/ntb/ntb.h
new file mode 100644
index 0000000..8593c65
--- /dev/null
+++ b/sys/dev/ntb/ntb.h
@@ -0,0 +1,409 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NTB_H_
+#define _NTB_H_
+
+#include "ntb_if.h"
+
+extern devclass_t ntb_hw_devclass;
+SYSCTL_DECL(_hw_ntb);
+
+int ntb_register_device(device_t ntb);
+int ntb_unregister_device(device_t ntb);
+
+/*
+ * ntb_link_event() - notify driver context of a change in link status
+ * @ntb: NTB device context
+ *
+ * Notify the driver context that the link status may have changed. The driver
+ * should call ntb_link_is_up() to get the current status.
+ */
+void ntb_link_event(device_t ntb);
+
+/*
+ * ntb_db_event() - notify driver context of a doorbell event
+ * @ntb: NTB device context
+ * @vector: Interrupt vector number
+ *
+ * Notify the driver context of a doorbell event. If hardware supports
+ * multiple interrupt vectors for doorbells, the vector number indicates which
+ * vector received the interrupt. The vector number is relative to the first
+ * vector used for doorbells, starting at zero, and must be less than
+ * ntb_db_vector_count(). The driver may call ntb_db_read() to check which
+ * doorbell bits need service, and ntb_db_vector_mask() to determine which of
+ * those bits are associated with the vector number.
+ */
+void ntb_db_event(device_t ntb, uint32_t vec);
+
+/*
+ * ntb_link_is_up() - get the current ntb link state
+ * @ntb: NTB device context
+ * @speed: OUT - The link speed expressed as PCIe generation number
+ * @width: OUT - The link width expressed as the number of PCIe lanes
+ *
+ * RETURNS: true or false based on the hardware link state
+ */
+bool ntb_link_is_up(device_t ntb, enum ntb_speed *speed, enum ntb_width *width);
+
+/*
+ * ntb_link_enable() - enable the link on the secondary side of the ntb
+ * @ntb: NTB device context
+ * @speed: The maximum link speed expressed as PCIe generation number[0]
+ * @width: The maximum link width expressed as the number of PCIe lanes[0]
+ *
+ * Enable the link on the secondary side of the ntb. This can only be done
+ * from the primary side of the ntb in primary or b2b topology. The ntb device
+ * should train the link to its maximum speed and width, or the requested speed
+ * and width, whichever is smaller, if supported.
+ *
+ * Return: Zero on success, otherwise an error number.
+ *
+ * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed
+ * and width input will be ignored.
+ */
+int ntb_link_enable(device_t ntb, enum ntb_speed speed, enum ntb_width width);
+
+/*
+ * ntb_link_disable() - disable the link on the secondary side of the ntb
+ * @ntb: NTB device context
+ *
+ * Disable the link on the secondary side of the ntb. This can only be done
+ * from the primary side of the ntb in primary or b2b topology. The ntb device
+ * should disable the link. Returning from this call must indicate that a
+ * barrier has passed, though with no more writes may pass in either direction
+ * across the link, except if this call returns an error number.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_link_disable(device_t ntb);
+
+/*
+ * get enable status of the link on the secondary side of the ntb
+ */
+bool ntb_link_enabled(device_t ntb);
+
+/*
+ * ntb_set_ctx() - associate a driver context with an ntb device
+ * @ntb: NTB device context
+ * @ctx: Driver context
+ * @ctx_ops: Driver context operations
+ *
+ * Associate a driver context and operations with an ntb device. The context is
+ * provided by the client driver, and the driver may associate a different
+ * context with each ntb device.
+ *
+ * Return: Zero if the context is associated, otherwise an error number.
+ */
+int ntb_set_ctx(device_t ntb, void *ctx, const struct ntb_ctx_ops *ctx_ops);
+
+/*
+ * ntb_get_ctx() - get the driver context associated with an ntb device
+ * @ntb: NTB device context
+ * @ctx_ops: Driver context operations
+ *
+ * Get the driver context and operations associated with an ntb device.
+ */
+void * ntb_get_ctx(device_t ntb, const struct ntb_ctx_ops **ctx_ops);
+
+/*
+ * ntb_clear_ctx() - disassociate any driver context from an ntb device
+ * @ntb: NTB device context
+ *
+ * Clear any association that may exist between a driver context and the ntb
+ * device.
+ */
+void ntb_clear_ctx(device_t ntb);
+
+/*
+ * ntb_mw_count() - Get the number of memory windows available for KPI
+ * consumers.
+ *
+ * (Excludes any MW wholly reserved for register access.)
+ */
+uint8_t ntb_mw_count(device_t ntb);
+
+/*
+ * ntb_mw_get_range() - get the range of a memory window
+ * @ntb: NTB device context
+ * @mw_idx: Memory window number
+ * @base: OUT - the base address for mapping the memory window
+ * @vbase: OUT - the virtual address for mapping the memory window
+ * @size: OUT - the size for mapping the memory window
+ * @align: OUT - the base alignment for translating the memory window
+ * @align_size: OUT - the size alignment for translating the memory window
+ * @plimit: OUT - the upper limit on addresses accepted for translation
+ *
+ * Get the range of a memory window. NULL may be given for any output
+ * parameter if the value is not needed. The base and size may be used for
+ * mapping the memory window, to access the peer memory. The alignment and
+ * size may be used for translating the memory window, for the peer to access
+ * memory on the local system.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_get_range(device_t ntb, unsigned mw_idx, vm_paddr_t *base,
+ caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
+ bus_addr_t *plimit);
+
+/*
+ * ntb_mw_set_trans() - set the translation of a memory window
+ * @ntb: NTB device context
+ * @idx: Memory window number
+ * @addr: The DMA address of the local memory to expose to the peer
+ * @size: The size of the local memory to expose to the peer
+ *
+ * Set the translation of a memory window. The peer may access local memory
+ * through the window starting at the address, up to the size. The address
+ * must be aligned to the alignment specified by ntb_mw_get_range(). The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range(). The
+ * address must be below the plimit specified by ntb_mw_get_range() (this
+ * matters, e.g., for 32-bit BARs).
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_set_trans(device_t ntb, unsigned mw_idx, bus_addr_t addr,
+ size_t size);
+
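A sketch of the whole sequence under stated assumptions (contigmalloc(9) and vtophys(9) stand in for whatever DMA-safe allocation the client really uses; window 0 is illustrative):

	static int
	client_expose_buffer(device_t ntb, void **bufp)
	{
		vm_paddr_t base;
		caddr_t vbase;
		size_t size, align, align_size;
		bus_addr_t plimit, high;
		void *buf;
		int error;

		error = ntb_mw_get_range(ntb, 0, &base, &vbase, &size,
		    &align, &align_size, &plimit);
		if (error != 0)
			return (error);

		/* Treat plimit == 0 as "no limit" (64-bit BAR). */
		high = (plimit != 0) ? plimit : BUS_SPACE_MAXADDR;
		buf = contigmalloc(size, M_DEVBUF, M_WAITOK | M_ZERO,
		    0, high, align, 0);
		if (buf == NULL)
			return (ENOMEM);

		error = ntb_mw_set_trans(ntb, 0, vtophys(buf), size);
		if (error != 0)
			return (error);
		*bufp = buf;
		return (0);
	}
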
+/*
+ * ntb_mw_clear_trans() - clear the translation of a memory window
+ * @ntb: NTB device context
+ * @idx: Memory window number
+ *
+ * Clear the translation of a memory window. The peer may no longer access
+ * local memory through the window.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int ntb_mw_clear_trans(device_t ntb, unsigned mw_idx);
+
+/*
+ * ntb_mw_get_wc - Get the write-combine status of a memory window
+ *
+ * Returns: Zero on success, setting *wc; otherwise an error number (e.g. if
+ * idx is an invalid memory window).
+ *
+ * Mode is a VM_MEMATTR_* type.
+ */
+int ntb_mw_get_wc(device_t ntb, unsigned mw_idx, vm_memattr_t *mode);
+
+/*
+ * ntb_mw_set_wc - Set the write-combine status of a memory window
+ *
+ * If 'mode' matches the current status, this does nothing and succeeds. Mode
+ * is a VM_MEMATTR_* type.
+ *
+ * Returns: Zero on success, setting the caching attribute on the virtual
+ * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid
+ * memory window, or if changing the caching attribute fails).
+ */
+int ntb_mw_set_wc(device_t ntb, unsigned mw_idx, vm_memattr_t mode);
+
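For example, a client pushing bulk data through window 0 might request write-combining (VM_MEMATTR_WRITE_COMBINING is the usual x86 attribute; the index is illustrative):

	error = ntb_mw_set_wc(ntb, 0, VM_MEMATTR_WRITE_COMBINING);
	if (error != 0)
		device_printf(dev, "ntb_mw_set_wc failed: %d\n", error);
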
+/*
+ * ntb_spad_count() - get the total number of scratchpad registers usable
+ * @ntb: NTB device context
+ *
+ * This function returns the number of 32bit scratchpad registers usable by
+ * the upper layer.
+ *
+ * RETURNS: total number of scratch pad registers available
+ */
+uint8_t ntb_spad_count(device_t ntb);
+
+/*
+ * ntb_spad_clear() - zero local scratchpad registers
+ * @ntb: NTB device context
+ *
+ * This function overwrites all local scratchpad registers with zeroes.
+ */
+void ntb_spad_clear(device_t ntb);
+
+/*
+ * ntb_spad_write() - write to the local scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed local
+ * scratchpad register.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_spad_write(device_t ntb, unsigned int idx, uint32_t val);
+
+/*
+ * ntb_spad_read() - read from the local scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the indexed 32bit scratchpad register on
+ * the local (internal) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_spad_read(device_t ntb, unsigned int idx, uint32_t *val);
+
+/*
+ * ntb_peer_spad_write() - write to the peer scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register on the peer (secondary, external) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_peer_spad_write(device_t ntb, unsigned int idx, uint32_t val);
+
+/*
+ * ntb_peer_spad_read() - read from the peer scratchpad register
+ * @ntb: NTB device context
+ * @idx: index to the scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the indexed 32bit scratchpad register on
+ * the peer (secondary, external) side.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int ntb_peer_spad_read(device_t ntb, unsigned int idx, uint32_t *val);
+
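Taken together, the four scratchpad calls support simple handshakes like this sketch (the register index and magic value are illustrative only):

	#define CLIENT_VER_SPAD	0
	#define CLIENT_VERSION	1

	static bool
	client_peer_version_ok(device_t ntb)
	{
		uint32_t val;

		/* Publish our version into the peer's scratchpad... */
		if (ntb_peer_spad_write(ntb, CLIENT_VER_SPAD,
		    CLIENT_VERSION) != 0)
			return (false);
		/* ...and read what the peer published into ours. */
		if (ntb_spad_read(ntb, CLIENT_VER_SPAD, &val) != 0)
			return (false);
		return (val == CLIENT_VERSION);
	}
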
+/*
+ * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
+ * @ntb: NTB device context
+ *
+ * Hardware may support a different number or arrangement of doorbell bits.
+ *
+ * Return: A mask of doorbell bits supported by the ntb.
+ */
+uint64_t ntb_db_valid_mask(device_t ntb);
+
+/*
+ * ntb_db_vector_count() - get the number of doorbell interrupt vectors
+ * @ntb: NTB device context.
+ *
+ * Hardware may support a different number of interrupt vectors.
+ *
+ * Return: The number of doorbell interrupt vectors.
+ */
+int ntb_db_vector_count(device_t ntb);
+
+/*
+ * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
+ * @ntb: NTB device context
+ * @vector: Doorbell vector number
+ *
+ * Each interrupt vector may have a different number or arrangement of bits.
+ *
+ * Return: A mask of doorbell bits serviced by a vector.
+ */
+uint64_t ntb_db_vector_mask(device_t ntb, uint32_t vector);
+
+/*
+ * ntb_peer_db_addr() - address and size of the peer doorbell register
+ * @ntb: NTB device context.
+ * @db_addr: OUT - The address of the peer doorbell register.
+ * @db_size: OUT - The number of bytes to write to the peer doorbell register.
+ *
+ * Return the address of the peer doorbell register. This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ * A driver may wish to ring the peer doorbell at the completion of memory
+ * copy operations. For efficiency, and to simplify ordering of operations
+ * between the dma memory copies and ringing the doorbell, the driver may
+ * append one additional dma memory copy with the doorbell register as the
+ * destination, after the memory copy operations.
+ *
+ * Return: Zero on success, otherwise an error number.
+ *
+ * Note that writing the peer doorbell via a memory window will *not* generate
+ * an interrupt on the remote host; that must be done separately.
+ */
+int ntb_peer_db_addr(device_t ntb, bus_addr_t *db_addr, vm_size_t *db_size);
+
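A sketch of the pattern described above (the DMA-engine API is elided; only the NTB calls appear):

	bus_addr_t db_addr;
	vm_size_t db_size;

	if (ntb_peer_db_addr(ntb, &db_addr, &db_size) == 0) {
		/*
		 * Queue the data copies, then one extra db_size-byte
		 * copy with db_addr as the destination so the doorbell
		 * rings in order, after the data has landed.
		 */
	}
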
+/*
+ * ntb_db_clear() - clear bits in the local doorbell register
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell register, arming the bits for the next
+ * doorbell.
+ */
+void ntb_db_clear(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_db_clear_mask() - clear bits in the local doorbell mask
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell mask register, allowing doorbell interrupts
+ * to be generated for those doorbell bits. If a doorbell bit is already
+ * set at the time the mask is cleared, and the corresponding mask bit is
+ * changed from set to clear, then the ntb driver must ensure that
+ * ntb_db_event() is called. If the hardware does not generate the interrupt
+ * on clearing the mask bit, then the driver must call ntb_db_event() anyway.
+ */
+void ntb_db_clear_mask(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_db_read() - read the local doorbell register
+ * @ntb: NTB device context.
+ *
+ * Read the local doorbell register, and return the bits that are set.
+ *
+ * Return: The bits currently set in the local doorbell register.
+ */
+uint64_t ntb_db_read(device_t ntb);
+
+/*
+ * ntb_db_set_mask() - set bits in the local doorbell mask
+ * @ntb: NTB device context.
+ * @db_bits: Doorbell mask bits to set.
+ *
+ * Set bits in the local doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits. Bits that were already set
+ * must remain set.
+ */
+void ntb_db_set_mask(device_t ntb, uint64_t bits);
+
+/*
+ * ntb_peer_db_set() - Set the doorbell on the secondary/external side
+ * @ntb: NTB device context
+ * @bits: doorbell bits to ring
+ *
+ * This function allows triggering of a doorbell on the secondary/external
+ * side that will initiate an interrupt on the remote host.
+ */
+void ntb_peer_db_set(device_t ntb, uint64_t bits);
+
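Putting the doorbell calls together, a db_event callback typically follows a mask/read/clear/unmask pattern; a minimal sketch (the softc type is hypothetical):

	static void
	client_db_event(void *ctx, uint32_t vector)
	{
		struct client_softc *sc = ctx;	/* hypothetical softc */
		uint64_t db;

		db = ntb_db_read(sc->dev) &
		    ntb_db_vector_mask(sc->dev, vector);
		ntb_db_set_mask(sc->dev, db);	/* quiesce while we work */
		ntb_db_clear(sc->dev, db);	/* re-arm for the next ring */
		/* ... process the events signalled by the bits in 'db' ... */
		ntb_db_clear_mask(sc->dev, db);	/* allow interrupts again */
	}

	/* Elsewhere, ring the peer: ntb_peer_db_set(sc->dev, 1ull << 0); */
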
+#endif /* _NTB_H_ */
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.c b/sys/dev/ntb/ntb_hw/ntb_hw.c
index b757f01..609aa4d 100644
--- a/sys/dev/ntb/ntb_hw/ntb_hw.c
+++ b/sys/dev/ntb/ntb_hw/ntb_hw.c
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -25,6 +26,16 @@
* SUCH DAMAGE.
*/
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a driver for NTB hardware in Intel Xeon/Atom CPUs.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -33,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/endian.h>
+#include <sys/interrupt.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
@@ -51,19 +63,7 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include "ntb_regs.h"
-#include "ntb_hw.h"
-
-/*
- * The Non-Transparent Bridge (NTB) is a device on some Intel processors that
- * allows you to connect two systems using a PCI-e link.
- *
- * This module contains the hardware abstraction layer for the NTB. It allows
- * you to send and recieve interrupts, map the memory windows and send and
- * receive messages in the scratch-pad registers.
- *
- * NOTE: Much of the code in this module is shared with Linux. Any patches may
- * be picked up and redistributed in Linux with a dual GPL/BSD license.
- */
+#include "../ntb.h"
#define MAX_MSIX_INTERRUPTS MAX(XEON_DB_COUNT, ATOM_DB_COUNT)
@@ -71,8 +71,6 @@ __FBSDID("$FreeBSD$");
#define ATOM_LINK_RECOVERY_TIME 500 /* ms */
#define BAR_HIGH_MASK (~((1ull << 12) - 1))
-#define DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev))
-
#define NTB_MSIX_VER_GUARD 0xaabbccdd
#define NTB_MSIX_RECEIVED 0xe0f0e0f0
@@ -123,8 +121,8 @@ enum {
};
/* Device features and workarounds */
-#define HAS_FEATURE(feature) \
- ((ntb->features & (feature)) != 0)
+#define HAS_FEATURE(ntb, feature) \
+ (((ntb)->features & (feature)) != 0)
struct ntb_hw_info {
uint32_t device_id;
@@ -203,6 +201,9 @@ struct ntb_msix_data {
};
struct ntb_softc {
+ /* ntb.c context. Do not move! Must go first! */
+ void *ntb_store;
+
device_t device;
enum ntb_device_type type;
uint32_t features;
@@ -221,13 +222,7 @@ struct ntb_softc {
struct callout heartbeat_timer;
struct callout lr_timer;
- void *ntb_ctx;
- const struct ntb_ctx_ops *ctx_ops;
struct ntb_vec *msix_vec;
-#define CTX_LOCK(sc) mtx_lock(&(sc)->ctx_lock)
-#define CTX_UNLOCK(sc) mtx_unlock(&(sc)->ctx_lock)
-#define CTX_ASSERT(sc,f) mtx_assert(&(sc)->ctx_lock, (f))
- struct mtx ctx_lock;
uint32_t ppd;
enum ntb_conn_type conn_type;
@@ -259,6 +254,7 @@ struct ntb_softc {
uint64_t db_valid_mask;
uint64_t db_link_mask;
uint64_t db_mask;
+	uint64_t		fake_db_bell;	/* NTB_SB01BASE_LOCKUP */
int last_ts; /* ticks @ last irq */
@@ -288,61 +284,74 @@ bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t handle,
}
#endif
-#define ntb_bar_read(SIZE, bar, offset) \
+#define intel_ntb_bar_read(SIZE, bar, offset) \
bus_space_read_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
ntb->bar_info[(bar)].pci_bus_handle, (offset))
-#define ntb_bar_write(SIZE, bar, offset, val) \
+#define intel_ntb_bar_write(SIZE, bar, offset, val) \
bus_space_write_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
ntb->bar_info[(bar)].pci_bus_handle, (offset), (val))
-#define ntb_reg_read(SIZE, offset) ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset)
-#define ntb_reg_write(SIZE, offset, val) \
- ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val)
-#define ntb_mw_read(SIZE, offset) \
- ntb_bar_read(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), offset)
-#define ntb_mw_write(SIZE, offset, val) \
- ntb_bar_write(SIZE, ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
+#define intel_ntb_reg_read(SIZE, offset) \
+ intel_ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset)
+#define intel_ntb_reg_write(SIZE, offset, val) \
+ intel_ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val)
+#define intel_ntb_mw_read(SIZE, offset) \
+ intel_ntb_bar_read(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
+ offset)
+#define intel_ntb_mw_write(SIZE, offset, val) \
+ intel_ntb_bar_write(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
offset, val)
-static int ntb_probe(device_t device);
-static int ntb_attach(device_t device);
-static int ntb_detach(device_t device);
-static unsigned ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx);
-static inline enum ntb_bar ntb_mw_to_bar(struct ntb_softc *, unsigned mw);
+static int intel_ntb_probe(device_t device);
+static int intel_ntb_attach(device_t device);
+static int intel_ntb_detach(device_t device);
+static uint64_t intel_ntb_db_valid_mask(device_t dev);
+static void intel_ntb_spad_clear(device_t dev);
+static uint64_t intel_ntb_db_vector_mask(device_t dev, uint32_t vector);
+static bool intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed,
+ enum ntb_width *width);
+static int intel_ntb_link_enable(device_t dev, enum ntb_speed speed,
+ enum ntb_width width);
+static int intel_ntb_link_disable(device_t dev);
+static int intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val);
+static int intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val);
+
+static unsigned intel_ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx);
+static inline enum ntb_bar intel_ntb_mw_to_bar(struct ntb_softc *, unsigned mw);
static inline bool bar_is_64bit(struct ntb_softc *, enum ntb_bar);
static inline void bar_get_xlat_params(struct ntb_softc *, enum ntb_bar,
uint32_t *base, uint32_t *xlat, uint32_t *lmt);
-static int ntb_map_pci_bars(struct ntb_softc *ntb);
-static int ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx,
+static int intel_ntb_map_pci_bars(struct ntb_softc *ntb);
+static int intel_ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx,
vm_memattr_t);
static void print_map_success(struct ntb_softc *, struct ntb_pci_bar_info *,
const char *);
static int map_mmr_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar);
static int map_memory_window_bar(struct ntb_softc *ntb,
struct ntb_pci_bar_info *bar);
-static void ntb_unmap_pci_bar(struct ntb_softc *ntb);
-static int ntb_remap_msix(device_t, uint32_t desired, uint32_t avail);
-static int ntb_init_isr(struct ntb_softc *ntb);
-static int ntb_setup_legacy_interrupt(struct ntb_softc *ntb);
-static int ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors);
-static void ntb_teardown_interrupts(struct ntb_softc *ntb);
-static inline uint64_t ntb_vec_mask(struct ntb_softc *, uint64_t db_vector);
-static void ntb_interrupt(struct ntb_softc *, uint32_t vec);
+static void intel_ntb_unmap_pci_bar(struct ntb_softc *ntb);
+static int intel_ntb_remap_msix(device_t, uint32_t desired, uint32_t avail);
+static int intel_ntb_init_isr(struct ntb_softc *ntb);
+static int intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb);
+static int intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors);
+static void intel_ntb_teardown_interrupts(struct ntb_softc *ntb);
+static inline uint64_t intel_ntb_vec_mask(struct ntb_softc *, uint64_t db_vector);
+static void intel_ntb_interrupt(struct ntb_softc *, uint32_t vec);
static void ndev_vec_isr(void *arg);
static void ndev_irq_isr(void *arg);
static inline uint64_t db_ioread(struct ntb_softc *, uint64_t regoff);
static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t);
static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t);
-static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
-static void ntb_free_msix_vec(struct ntb_softc *ntb);
-static void ntb_get_msix_info(struct ntb_softc *ntb);
-static void ntb_exchange_msix(void *);
-static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id);
-static void ntb_detect_max_mw(struct ntb_softc *ntb);
-static int ntb_detect_xeon(struct ntb_softc *ntb);
-static int ntb_detect_atom(struct ntb_softc *ntb);
-static int ntb_xeon_init_dev(struct ntb_softc *ntb);
-static int ntb_atom_init_dev(struct ntb_softc *ntb);
-static void ntb_teardown_xeon(struct ntb_softc *ntb);
+static int intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
+static void intel_ntb_free_msix_vec(struct ntb_softc *ntb);
+static void intel_ntb_get_msix_info(struct ntb_softc *ntb);
+static void intel_ntb_exchange_msix(void *);
+static struct ntb_hw_info *intel_ntb_get_device_info(uint32_t device_id);
+static void intel_ntb_detect_max_mw(struct ntb_softc *ntb);
+static int intel_ntb_detect_xeon(struct ntb_softc *ntb);
+static int intel_ntb_detect_atom(struct ntb_softc *ntb);
+static int intel_ntb_xeon_init_dev(struct ntb_softc *ntb);
+static int intel_ntb_atom_init_dev(struct ntb_softc *ntb);
+static void intel_ntb_teardown_xeon(struct ntb_softc *ntb);
static void configure_atom_secondary_side_bars(struct ntb_softc *ntb);
static void xeon_reset_sbar_size(struct ntb_softc *, enum ntb_bar idx,
enum ntb_bar regbar);
@@ -352,18 +361,16 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr,
enum ntb_bar idx);
static int xeon_setup_b2b_mw(struct ntb_softc *,
const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr);
-static int xeon_setup_msix_bar(struct ntb_softc *);
static inline bool link_is_up(struct ntb_softc *ntb);
static inline bool _xeon_link_is_up(struct ntb_softc *ntb);
static inline bool atom_link_is_err(struct ntb_softc *ntb);
-static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *);
-static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *);
+static inline enum ntb_speed intel_ntb_link_sta_speed(struct ntb_softc *);
+static inline enum ntb_width intel_ntb_link_sta_width(struct ntb_softc *);
static void atom_link_hb(void *arg);
-static void ntb_db_event(struct ntb_softc *ntb, uint32_t vec);
static void recover_atom_link(void *arg);
-static bool ntb_poll_link(struct ntb_softc *ntb);
+static bool intel_ntb_poll_link(struct ntb_softc *ntb);
static void save_bar_parameters(struct ntb_pci_bar_info *bar);
-static void ntb_sysctl_init(struct ntb_softc *);
+static void intel_ntb_sysctl_init(struct ntb_softc *);
static int sysctl_handle_features(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS);
@@ -374,7 +381,7 @@ static unsigned g_ntb_hw_debug_level;
TUNABLE_INT("hw.ntb.debug_level", &g_ntb_hw_debug_level);
SYSCTL_UINT(_hw_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN,
&g_ntb_hw_debug_level, 0, "ntb_hw log level -- higher is more verbose");
-#define ntb_printf(lvl, ...) do { \
+#define intel_ntb_printf(lvl, ...) do { \
if ((lvl) <= g_ntb_hw_debug_level) { \
device_printf(ntb->device, __VA_ARGS__); \
} \
@@ -398,7 +405,7 @@ SYSCTL_UINT(_hw_ntb, OID_AUTO, default_mw_pat, CTLFLAG_RDTUN,
"UC-: " __XSTRING(_NTB_PAT_UCM));
static inline vm_memattr_t
-ntb_pat_flags(void)
+intel_ntb_pat_flags(void)
{
switch (g_ntb_mw_pat) {
@@ -424,7 +431,7 @@ ntb_pat_flags(void)
* anywhere better yet.
*/
static inline const char *
-ntb_vm_memattr_to_str(vm_memattr_t pat)
+intel_ntb_vm_memattr_to_str(vm_memattr_t pat)
{
switch (pat) {
@@ -445,7 +452,8 @@ ntb_vm_memattr_to_str(vm_memattr_t pat)
}
}
-static int g_ntb_msix_idx = 0;
+static int g_ntb_msix_idx = 1;
+TUNABLE_INT("hw.ntb.msix_mw_idx", &g_ntb_msix_idx);
SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, CTLFLAG_RDTUN, &g_ntb_msix_idx,
0, "Use this memory window to access the peer MSIX message complex on "
"certain Xeon-based NTB systems, as a workaround for a hardware errata. "
@@ -461,6 +469,18 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx,
"available memory window. Both sides of the NTB MUST set the same "
"value here! (Applies on Xeon platforms with SDOORBELL_LOCKUP errata.)");
+/* Hardware owns the low 16 bits of features. */
+#define NTB_BAR_SIZE_4K (1 << 0)
+#define NTB_SDOORBELL_LOCKUP (1 << 1)
+#define NTB_SB01BASE_LOCKUP (1 << 2)
+#define NTB_B2BDOORBELL_BIT14 (1 << 3)
+/* Software/configuration owns the top 16 bits. */
+#define NTB_SPLIT_BAR (1ull << 16)
+
+#define NTB_FEATURES_STR \
+ "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \
+ "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K"
+
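The string above feeds the kernel printf(9) "%b" conversion: the leading \20 selects hexadecimal output, and each following \NN octal escape names bit NN. A hedged usage sketch:

	device_printf(ntb->device, "features: %b\n", (int)ntb->features,
	    NTB_FEATURES_STR);
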
static struct ntb_hw_info pci_ids[] = {
/* XXX: PS/SS IDs left out until they are supported. */
{ 0x0C4E8086, "BWD Atom Processor S1200 Non-Transparent Bridge B2B",
@@ -609,35 +629,15 @@ SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar5_addr32, CTLFLAG_RDTUN,
*/
MALLOC_DEFINE(M_NTB, "ntb_hw", "ntb_hw driver memory allocations");
-static device_method_t ntb_pci_methods[] = {
- /* Device interface */
- DEVMETHOD(device_probe, ntb_probe),
- DEVMETHOD(device_attach, ntb_attach),
- DEVMETHOD(device_detach, ntb_detach),
- DEVMETHOD_END
-};
-
-static driver_t ntb_pci_driver = {
- "ntb_hw",
- ntb_pci_methods,
- sizeof(struct ntb_softc),
-};
-
-static devclass_t ntb_devclass;
-DRIVER_MODULE(ntb_hw, pci, ntb_pci_driver, ntb_devclass, NULL, NULL);
-MODULE_VERSION(ntb_hw, 1);
-
-SYSCTL_NODE(_hw, OID_AUTO, ntb, CTLFLAG_RW, 0, "NTB sysctls");
-
/*
* OS <-> Driver linkage functions
*/
static int
-ntb_probe(device_t device)
+intel_ntb_probe(device_t device)
{
struct ntb_hw_info *p;
- p = ntb_get_device_info(pci_get_devid(device));
+ p = intel_ntb_get_device_info(pci_get_devid(device));
if (p == NULL)
return (ENXIO);
@@ -646,14 +646,14 @@ ntb_probe(device_t device)
}
static int
-ntb_attach(device_t device)
+intel_ntb_attach(device_t device)
{
struct ntb_softc *ntb;
struct ntb_hw_info *p;
int error;
- ntb = DEVICE2SOFTC(device);
- p = ntb_get_device_info(pci_get_devid(device));
+ ntb = device_get_softc(device);
+ p = intel_ntb_get_device_info(pci_get_devid(device));
ntb->device = device;
ntb->type = p->type;
@@ -666,47 +666,52 @@ ntb_attach(device_t device)
callout_init(&ntb->lr_timer, CALLOUT_MPSAFE);
callout_init(&ntb->peer_msix_work, 1);
mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN);
- mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF);
if (ntb->type == NTB_ATOM)
- error = ntb_detect_atom(ntb);
+ error = intel_ntb_detect_atom(ntb);
else
- error = ntb_detect_xeon(ntb);
+ error = intel_ntb_detect_xeon(ntb);
if (error != 0)
goto out;
- ntb_detect_max_mw(ntb);
+ intel_ntb_detect_max_mw(ntb);
pci_enable_busmaster(ntb->device);
- error = ntb_map_pci_bars(ntb);
+ error = intel_ntb_map_pci_bars(ntb);
if (error != 0)
goto out;
if (ntb->type == NTB_ATOM)
- error = ntb_atom_init_dev(ntb);
+ error = intel_ntb_atom_init_dev(ntb);
else
- error = ntb_xeon_init_dev(ntb);
+ error = intel_ntb_xeon_init_dev(ntb);
if (error != 0)
goto out;
- ntb_spad_clear(ntb);
+ intel_ntb_spad_clear(device);
+
+ intel_ntb_poll_link(ntb);
- ntb_poll_link(ntb);
+ intel_ntb_sysctl_init(ntb);
- ntb_sysctl_init(ntb);
+ /* Attach children to this controller */
+ error = ntb_register_device(device);
out:
if (error != 0)
- ntb_detach(device);
+ intel_ntb_detach(device);
return (error);
}
static int
-ntb_detach(device_t device)
+intel_ntb_detach(device_t device)
{
struct ntb_softc *ntb;
- ntb = DEVICE2SOFTC(device);
+ ntb = device_get_softc(device);
+
+ /* Detach & delete all children */
+ ntb_unregister_device(device);
if (ntb->self_reg != NULL) {
DB_MASK_LOCK(ntb);
@@ -718,13 +723,12 @@ ntb_detach(device_t device)
callout_drain(&ntb->peer_msix_work);
pci_disable_busmaster(ntb->device);
if (ntb->type == NTB_XEON)
- ntb_teardown_xeon(ntb);
- ntb_teardown_interrupts(ntb);
+ intel_ntb_teardown_xeon(ntb);
+ intel_ntb_teardown_interrupts(ntb);
mtx_destroy(&ntb->db_mask_lock);
- mtx_destroy(&ntb->ctx_lock);
- ntb_unmap_pci_bar(ntb);
+ intel_ntb_unmap_pci_bar(ntb);
return (0);
}
@@ -733,7 +737,7 @@ ntb_detach(device_t device)
* Driver internal routines
*/
static inline enum ntb_bar
-ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw)
+intel_ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw)
{
KASSERT(mw < ntb->mw_count,
@@ -748,7 +752,7 @@ bar_is_64bit(struct ntb_softc *ntb, enum ntb_bar bar)
{
/* XXX This assertion could be stronger. */
KASSERT(bar < NTB_MAX_BARS, ("bogus bar"));
- return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(NTB_SPLIT_BAR));
+ return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(ntb, NTB_SPLIT_BAR));
}
static inline void
@@ -789,7 +793,7 @@ bar_get_xlat_params(struct ntb_softc *ntb, enum ntb_bar bar, uint32_t *base,
}
static int
-ntb_map_pci_bars(struct ntb_softc *ntb)
+intel_ntb_map_pci_bars(struct ntb_softc *ntb)
{
int rc;
@@ -814,7 +818,7 @@ ntb_map_pci_bars(struct ntb_softc *ntb)
ntb->bar_info[NTB_B2B_BAR_2].ssz_off = XEON_SBAR4SZ_OFFSET;
ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off = XEON_PBAR4XLAT_OFFSET;
- if (!HAS_FEATURE(NTB_SPLIT_BAR))
+ if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR))
goto out;
ntb->bar_info[NTB_B2B_BAR_3].pci_resource_id = PCIR_BAR(5);
@@ -888,7 +892,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
* but the PCI driver does not honor the size in this call, so we have
* to modify it after the fact.
*/
- if (HAS_FEATURE(NTB_BAR_SIZE_4K)) {
+ if (HAS_FEATURE(ntb, NTB_BAR_SIZE_4K)) {
if (bar->pci_resource_id == PCIR_BAR(2))
bar_size_bits = pci_read_config(ntb->device,
XEON_PBAR23SZ_OFFSET, 1);
@@ -915,7 +919,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
* Optionally, mark MW BARs as anything other than UC to improve
* performance.
*/
- mapmode = ntb_pat_flags();
+ mapmode = intel_ntb_pat_flags();
if (mapmode == bar->map_mode)
return (0);
@@ -928,7 +932,7 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
(char *)bar->vbase + bar->size - 1,
(void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
- ntb_vm_memattr_to_str(mapmode));
+ intel_ntb_vm_memattr_to_str(mapmode));
} else
device_printf(ntb->device,
"Unable to mark BAR%d v:[%p-%p] p:[%p-%p] as "
@@ -936,13 +940,13 @@ map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
(char *)bar->vbase + bar->size - 1,
(void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
- ntb_vm_memattr_to_str(mapmode), rc);
+ intel_ntb_vm_memattr_to_str(mapmode), rc);
/* Proceed anyway */
return (0);
}
static void
-ntb_unmap_pci_bar(struct ntb_softc *ntb)
+intel_ntb_unmap_pci_bar(struct ntb_softc *ntb)
{
struct ntb_pci_bar_info *current_bar;
int i;
@@ -957,7 +961,7 @@ ntb_unmap_pci_bar(struct ntb_softc *ntb)
}
static int
-ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors)
+intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors)
{
uint32_t i;
int rc;
@@ -1012,7 +1016,7 @@ SYSCTL_INT(_hw_ntb, OID_AUTO, prefer_intx_to_remap, CTLFLAG_RDTUN,
* round-robin fashion.
*/
static int
-ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
+intel_ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
{
u_int *vectors;
uint32_t i;
@@ -1032,7 +1036,7 @@ ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
}
static int
-ntb_init_isr(struct ntb_softc *ntb)
+intel_ntb_init_isr(struct ntb_softc *ntb)
{
uint32_t desired_vectors, num_vectors;
int rc;
@@ -1058,7 +1062,7 @@ ntb_init_isr(struct ntb_softc *ntb)
num_vectors--;
if (rc == 0 && num_vectors < desired_vectors) {
- rc = ntb_remap_msix(ntb->device, desired_vectors,
+ rc = intel_ntb_remap_msix(ntb->device, desired_vectors,
num_vectors);
if (rc == 0)
num_vectors = desired_vectors;
@@ -1071,7 +1075,7 @@ ntb_init_isr(struct ntb_softc *ntb)
num_vectors = 1;
if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) {
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
device_printf(ntb->device,
"Errata workaround does not support MSI or INTX\n");
return (EINVAL);
@@ -1079,32 +1083,30 @@ ntb_init_isr(struct ntb_softc *ntb)
ntb->db_vec_count = 1;
ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT;
- rc = ntb_setup_legacy_interrupt(ntb);
+ rc = intel_ntb_setup_legacy_interrupt(ntb);
} else {
if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS &&
- HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
device_printf(ntb->device,
"Errata workaround expects %d doorbell bits\n",
XEON_NONLINK_DB_MSIX_BITS);
return (EINVAL);
}
- ntb_create_msix_vec(ntb, num_vectors);
- rc = ntb_setup_msix(ntb, num_vectors);
- if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- ntb_get_msix_info(ntb);
+ intel_ntb_create_msix_vec(ntb, num_vectors);
+ rc = intel_ntb_setup_msix(ntb, num_vectors);
}
if (rc != 0) {
device_printf(ntb->device,
"Error allocating interrupts: %d\n", rc);
- ntb_free_msix_vec(ntb);
+ intel_ntb_free_msix_vec(ntb);
}
return (rc);
}
static int
-ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
+intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
{
int rc;
@@ -1131,7 +1133,7 @@ ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
}
static void
-ntb_teardown_interrupts(struct ntb_softc *ntb)
+intel_ntb_teardown_interrupts(struct ntb_softc *ntb)
{
struct ntb_int_info *current_int;
int i;
@@ -1147,7 +1149,7 @@ ntb_teardown_interrupts(struct ntb_softc *ntb)
rman_get_rid(current_int->res), current_int->res);
}
- ntb_free_msix_vec(ntb);
+ intel_ntb_free_msix_vec(ntb);
pci_release_msi(ntb->device);
}
@@ -1160,11 +1162,11 @@ db_ioread(struct ntb_softc *ntb, uint64_t regoff)
{
if (ntb->type == NTB_ATOM)
- return (ntb_reg_read(8, regoff));
+ return (intel_ntb_reg_read(8, regoff));
KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
- return (ntb_reg_read(2, regoff));
+ return (intel_ntb_reg_read(2, regoff));
}
static inline void
@@ -1186,89 +1188,78 @@ db_iowrite_raw(struct ntb_softc *ntb, uint64_t regoff, uint64_t val)
{
if (ntb->type == NTB_ATOM) {
- ntb_reg_write(8, regoff, val);
+ intel_ntb_reg_write(8, regoff, val);
return;
}
KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
- ntb_reg_write(2, regoff, (uint16_t)val);
+ intel_ntb_reg_write(2, regoff, (uint16_t)val);
}
-void
-ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_set_mask(device_t dev, uint64_t bits)
{
-
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return;
+ struct ntb_softc *ntb = device_get_softc(dev);
DB_MASK_LOCK(ntb);
ntb->db_mask |= bits;
- db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ if (!HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
DB_MASK_UNLOCK(ntb);
}
-void
-ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_clear_mask(device_t dev, uint64_t bits)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
+ uint64_t ibits;
+ int i;
KASSERT((bits & ~ntb->db_valid_mask) == 0,
("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return;
-
DB_MASK_LOCK(ntb);
+ ibits = ntb->fake_db_bell & ntb->db_mask & bits;
ntb->db_mask &= ~bits;
- db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ /* Simulate fake interrupts if unmasked DB bits are set. */
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ if ((ibits & intel_ntb_db_vector_mask(dev, i)) != 0)
+ swi_sched(ntb->int_info[i].tag, 0);
+ }
+ } else {
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ }
DB_MASK_UNLOCK(ntb);
}
-uint64_t
-ntb_db_read(struct ntb_softc *ntb)
+static uint64_t
+intel_ntb_db_read(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- uint64_t res;
- unsigned i;
-
- res = 0;
- for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if (ntb->msix_vec[i].masked != 0)
- res |= ntb_db_vector_mask(ntb, i);
- }
- return (res);
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
+ return (ntb->fake_db_bell);
return (db_ioread(ntb, ntb->self_reg->db_bell));
}
-void
-ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
+static void
+intel_ntb_db_clear(device_t dev, uint64_t bits)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
KASSERT((bits & ~ntb->db_valid_mask) == 0,
("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- unsigned i;
-
- for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if ((bits & ntb_db_vector_mask(ntb, i)) != 0) {
- DB_MASK_LOCK(ntb);
- if (ntb->msix_vec[i].masked != 0) {
- /* XXX These need a public API. */
-#if 0
- pci_unmask_msix(ntb->device, i);
-#endif
- ntb->msix_vec[i].masked = 0;
- }
- DB_MASK_UNLOCK(ntb);
- }
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ DB_MASK_LOCK(ntb);
+ ntb->fake_db_bell &= ~bits;
+ DB_MASK_UNLOCK(ntb);
return;
}
@@ -1276,43 +1267,59 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
}
static inline uint64_t
-ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
+intel_ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
{
uint64_t shift, mask;
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ /*
+		 * Remap vectors in a custom way so that at least the
+		 * first three doorbells do not generate stray events.
+		 * This breaks Linux compatibility (if one existed)
+		 * when more than one DB is used (not by if_ntb).
+ */
+ if (db_vector < XEON_NONLINK_DB_MSIX_BITS - 1)
+ return (1 << db_vector);
+ if (db_vector == XEON_NONLINK_DB_MSIX_BITS - 1)
+ return (0x7ffc);
+ }
+
shift = ntb->db_vec_shift;
mask = (1ull << shift) - 1;
return (mask << (shift * db_vector));
}
static void
-ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
+intel_ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
{
uint64_t vec_mask;
ntb->last_ts = ticks;
- vec_mask = ntb_vec_mask(ntb, vec);
+ vec_mask = intel_ntb_vec_mask(ntb, vec);
if ((vec_mask & ntb->db_link_mask) != 0) {
- if (ntb_poll_link(ntb))
- ntb_link_event(ntb);
+ if (intel_ntb_poll_link(ntb))
+ ntb_link_event(ntb->device);
}
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
(vec_mask & ntb->db_link_mask) == 0) {
DB_MASK_LOCK(ntb);
- if (ntb->msix_vec[vec].masked == 0) {
- /* XXX These need a public API. */
-#if 0
- pci_mask_msix(ntb->device, vec);
-#endif
- ntb->msix_vec[vec].masked = 1;
- }
+
+		/* Do not report the same DB events again if not cleared yet. */
+ vec_mask &= ~ntb->fake_db_bell;
+
+ /* Update our internal doorbell register. */
+ ntb->fake_db_bell |= vec_mask;
+
+ /* Do not report masked DB events. */
+ vec_mask &= ~ntb->db_mask;
+
DB_MASK_UNLOCK(ntb);
}
if ((vec_mask & ntb->db_valid_mask) != 0)
- ntb_db_event(ntb, vec);
+ ntb_db_event(ntb->device, vec);
}
static void
@@ -1320,18 +1327,18 @@ ndev_vec_isr(void *arg)
{
struct ntb_vec *nvec = arg;
- ntb_interrupt(nvec->ntb, nvec->num);
+ intel_ntb_interrupt(nvec->ntb, nvec->num);
}
static void
ndev_irq_isr(void *arg)
{
/* If we couldn't set up MSI-X, we only have the one vector. */
- ntb_interrupt(arg, 0);
+ intel_ntb_interrupt(arg, 0);
}
static int
-ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
+intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
{
uint32_t i;
@@ -1346,7 +1353,7 @@ ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
}
static void
-ntb_free_msix_vec(struct ntb_softc *ntb)
+intel_ntb_free_msix_vec(struct ntb_softc *ntb)
{
if (ntb->msix_vec == NULL)
@@ -1357,7 +1364,7 @@ ntb_free_msix_vec(struct ntb_softc *ntb)
}
static void
-ntb_get_msix_info(struct ntb_softc *ntb)
+intel_ntb_get_msix_info(struct ntb_softc *ntb)
{
struct pci_devinfo *dinfo;
struct pcicfg_msix *msix;
@@ -1366,8 +1373,6 @@ ntb_get_msix_info(struct ntb_softc *ntb)
dinfo = device_get_ivars(ntb->device);
msix = &dinfo->cfg.msix;
- laddr = data = 0;
-
CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data));
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
@@ -1375,7 +1380,7 @@ ntb_get_msix_info(struct ntb_softc *ntb)
laddr = bus_read_4(msix->msix_table_res, offset +
PCI_MSIX_ENTRY_LOWER_ADDR);
- ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr);
+ intel_ntb_printf(2, "local MSIX addr(%u): 0x%x\n", i, laddr);
KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE,
("local MSIX addr 0x%x not in MSI base 0x%x", laddr,
@@ -1384,14 +1389,14 @@ ntb_get_msix_info(struct ntb_softc *ntb)
data = bus_read_4(msix->msix_table_res, offset +
PCI_MSIX_ENTRY_DATA);
- ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data);
+ intel_ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data);
ntb->msix_data[i].nmd_data = data;
}
}
static struct ntb_hw_info *
-ntb_get_device_info(uint32_t device_id)
+intel_ntb_get_device_info(uint32_t device_id)
{
struct ntb_hw_info *ep = pci_ids;
@@ -1404,15 +1409,15 @@ ntb_get_device_info(uint32_t device_id)
}
static void
-ntb_teardown_xeon(struct ntb_softc *ntb)
+intel_ntb_teardown_xeon(struct ntb_softc *ntb)
{
if (ntb->reg != NULL)
- ntb_link_disable(ntb);
+ intel_ntb_link_disable(ntb->device);
}
static void
-ntb_detect_max_mw(struct ntb_softc *ntb)
+intel_ntb_detect_max_mw(struct ntb_softc *ntb)
{
if (ntb->type == NTB_ATOM) {
@@ -1420,14 +1425,14 @@ ntb_detect_max_mw(struct ntb_softc *ntb)
return;
}
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
ntb->mw_count = XEON_HSX_SPLIT_MW_COUNT;
else
ntb->mw_count = XEON_SNB_MW_COUNT;
}
static int
-ntb_detect_xeon(struct ntb_softc *ntb)
+intel_ntb_detect_xeon(struct ntb_softc *ntb)
{
uint8_t ppd, conn_type;
@@ -1442,11 +1447,21 @@ ntb_detect_xeon(struct ntb_softc *ntb)
if ((ppd & XEON_PPD_SPLIT_BAR) != 0)
ntb->features |= NTB_SPLIT_BAR;
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
+ !HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
+ device_printf(ntb->device,
+ "Can not apply SB01BASE_LOCKUP workaround "
+ "with split BARs disabled!\n");
+ device_printf(ntb->device,
+ "Expect system hangs under heavy NTB traffic!\n");
+ ntb->features &= ~NTB_SB01BASE_LOCKUP;
+ }
+
/*
* SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP
* errata workaround; only do one at a time.
*/
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
ntb->features &= ~NTB_SDOORBELL_LOCKUP;
conn_type = ppd & XEON_PPD_CONN_TYPE;
@@ -1465,7 +1480,7 @@ ntb_detect_xeon(struct ntb_softc *ntb)
}
static int
-ntb_detect_atom(struct ntb_softc *ntb)
+intel_ntb_detect_atom(struct ntb_softc *ntb)
{
uint32_t ppd, conn_type;
@@ -1490,7 +1505,7 @@ ntb_detect_atom(struct ntb_softc *ntb)
}
static int
-ntb_xeon_init_dev(struct ntb_softc *ntb)
+intel_ntb_xeon_init_dev(struct ntb_softc *ntb)
{
int rc;
@@ -1511,15 +1526,16 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
ntb->peer_reg = &xeon_b2b_reg;
ntb->xlat_reg = &xeon_sec_xlat;
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ ntb->fake_db_bell = 0;
ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) %
ntb->mw_count;
- ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
+ intel_ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
g_ntb_msix_idx, ntb->msix_mw_idx);
- rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
+ rc = intel_ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
VM_MEMATTR_UNCACHEABLE);
KASSERT(rc == 0, ("shouldn't fail"));
- } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ } else if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
/*
* There is a Xeon hardware errata related to writes to SDOORBELL or
* B2BDOORBELL in conjunction with inbound access to NTB MMIO space,
@@ -1529,12 +1545,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
*/
ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) %
ntb->mw_count;
- ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
+ intel_ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
g_ntb_mw_idx, ntb->b2b_mw_idx);
- rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
+ rc = intel_ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
VM_MEMATTR_UNCACHEABLE);
KASSERT(rc == 0, ("shouldn't fail"));
- } else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14))
+ } else if (HAS_FEATURE(ntb, NTB_B2BDOORBELL_BIT14))
/*
* HW Errata on bit 14 of b2bdoorbell register. Writes will not be
* mirrored to the remote system. Shrink the number of bits by one,
@@ -1557,7 +1573,7 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
return (rc);
/* Enable Bus Master and Memory Space on the secondary side */
- ntb_reg_write(2, XEON_SPCICMD_OFFSET,
+ intel_ntb_reg_write(2, XEON_SPCICMD_OFFSET,
PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
/*
@@ -1568,16 +1584,12 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
DB_MASK_UNLOCK(ntb);
- rc = xeon_setup_msix_bar(ntb);
- if (rc != 0)
- return (rc);
-
- rc = ntb_init_isr(ntb);
+ rc = intel_ntb_init_isr(ntb);
return (rc);
}
static int
-ntb_atom_init_dev(struct ntb_softc *ntb)
+intel_ntb_atom_init_dev(struct ntb_softc *ntb)
{
int error;
@@ -1604,15 +1616,15 @@ ntb_atom_init_dev(struct ntb_softc *ntb)
configure_atom_secondary_side_bars(ntb);
/* Enable Bus Master and Memory Space on the secondary side */
- ntb_reg_write(2, ATOM_SPCICMD_OFFSET,
+ intel_ntb_reg_write(2, ATOM_SPCICMD_OFFSET,
PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
- error = ntb_init_isr(ntb);
+ error = intel_ntb_init_isr(ntb);
if (error != 0)
return (error);
/* Initiate PCI-E link training */
- ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
callout_reset(&ntb->heartbeat_timer, 0, atom_link_hb, ntb);
@@ -1625,19 +1637,19 @@ configure_atom_secondary_side_bars(struct ntb_softc *ntb)
{
if (ntb->dev_type == NTB_DEV_USD) {
- ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
XEON_B2B_BAR4_ADDR64);
- ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
} else {
- ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET,
XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
+ intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET,
XEON_B2B_BAR4_ADDR64);
- ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
- ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
+ intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
}
}
@@ -1664,7 +1676,7 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx,
struct ntb_pci_bar_info *bar;
uint8_t bar_sz;
- if (!HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3)
+ if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_3)
return;
bar = &ntb->bar_info[idx];
@@ -1688,28 +1700,28 @@ xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr,
uint32_t base_reg, lmt_reg;
bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg);
- if (idx == regbar)
- bar_addr += ntb->b2b_off;
+ if (idx == regbar) {
+ if (ntb->b2b_off)
+ bar_addr += ntb->b2b_off;
+ else
+ bar_addr = 0;
+ }
- /*
- * Set limit registers first to avoid an errata where setting the base
- * registers locks the limit registers.
- */
if (!bar_is_64bit(ntb, idx)) {
- ntb_reg_write(4, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(4, lmt_reg);
+ intel_ntb_reg_write(4, base_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(4, base_reg);
(void)reg_val;
- ntb_reg_write(4, base_reg, bar_addr);
- reg_val = ntb_reg_read(4, base_reg);
+ intel_ntb_reg_write(4, lmt_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(4, lmt_reg);
(void)reg_val;
} else {
- ntb_reg_write(8, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(8, lmt_reg);
+ intel_ntb_reg_write(8, base_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(8, base_reg);
(void)reg_val;
- ntb_reg_write(8, base_reg, bar_addr);
- reg_val = ntb_reg_read(8, base_reg);
+ intel_ntb_reg_write(8, lmt_reg, bar_addr);
+ reg_val = intel_ntb_reg_read(8, lmt_reg);
(void)reg_val;
}
}
@@ -1720,30 +1732,17 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx)
struct ntb_pci_bar_info *bar;
bar = &ntb->bar_info[idx];
- if (HAS_FEATURE(NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) {
- ntb_reg_write(4, bar->pbarxlat_off, base_addr);
- base_addr = ntb_reg_read(4, bar->pbarxlat_off);
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR) && idx >= NTB_B2B_BAR_2) {
+ intel_ntb_reg_write(4, bar->pbarxlat_off, base_addr);
+ base_addr = intel_ntb_reg_read(4, bar->pbarxlat_off);
} else {
- ntb_reg_write(8, bar->pbarxlat_off, base_addr);
- base_addr = ntb_reg_read(8, bar->pbarxlat_off);
+ intel_ntb_reg_write(8, bar->pbarxlat_off, base_addr);
+ base_addr = intel_ntb_reg_read(8, bar->pbarxlat_off);
}
(void)base_addr;
}
static int
-xeon_setup_msix_bar(struct ntb_softc *ntb)
-{
- enum ntb_bar bar_num;
-
- if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- return (0);
-
- bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
- ntb->peer_lapic_bar = &ntb->bar_info[bar_num];
- return (0);
-}
-
-static int
xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
const struct ntb_b2b_addr *peer_addr)
{
@@ -1757,7 +1756,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
b2b_bar_num = NTB_CONFIG_BAR;
ntb->b2b_off = 0;
} else {
- b2b_bar_num = ntb_mw_to_bar(ntb, ntb->b2b_mw_idx);
+ b2b_bar_num = intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx);
KASSERT(b2b_bar_num > 0 && b2b_bar_num < NTB_MAX_BARS,
("invalid b2b mw bar"));
@@ -1788,7 +1787,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
bar_addr = addr->bar0_addr;
else if (b2b_bar_num == NTB_B2B_BAR_1)
bar_addr = addr->bar2_addr64;
- else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR))
+ else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
bar_addr = addr->bar4_addr64;
else if (b2b_bar_num == NTB_B2B_BAR_2)
bar_addr = addr->bar4_addr32;
@@ -1797,7 +1796,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
else
KASSERT(false, ("invalid bar"));
- ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr);
+ intel_ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr);
/*
* Other SBARs are normally hit by the PBAR xlat, except for the b2b
@@ -1808,7 +1807,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
*/
xeon_set_sbar_base_and_limit(ntb, addr->bar2_addr64, NTB_B2B_BAR_1,
b2b_bar_num);
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
xeon_set_sbar_base_and_limit(ntb, addr->bar4_addr32,
NTB_B2B_BAR_2, b2b_bar_num);
xeon_set_sbar_base_and_limit(ntb, addr->bar5_addr32,
@@ -1818,56 +1817,41 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
NTB_B2B_BAR_2, b2b_bar_num);
/* Zero incoming translation addrs */
- ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
- ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
-
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
- size_t size, xlatoffset;
+ intel_ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
- switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) {
- case NTB_B2B_BAR_1:
- size = 8;
- xlatoffset = XEON_SBAR2XLAT_OFFSET;
- break;
- case NTB_B2B_BAR_2:
- xlatoffset = XEON_SBAR4XLAT_OFFSET;
- if (HAS_FEATURE(NTB_SPLIT_BAR))
- size = 4;
- else
- size = 8;
- break;
- case NTB_B2B_BAR_3:
- xlatoffset = XEON_SBAR5XLAT_OFFSET;
- size = 4;
- break;
- default:
- KASSERT(false, ("Bogus msix mw idx: %u",
- ntb->msix_mw_idx));
- return (EINVAL);
- }
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
+ uint32_t xlat_reg, lmt_reg;
+ enum ntb_bar bar_num;
/*
* We point the chosen MSIX MW BAR xlat to remote LAPIC for
* workaround
*/
- if (size == 4) {
- ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE);
- ntb->msix_xlat = ntb_reg_read(4, xlatoffset);
+ bar_num = intel_ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
+ bar_get_xlat_params(ntb, bar_num, NULL, &xlat_reg, &lmt_reg);
+ if (bar_is_64bit(ntb, bar_num)) {
+ intel_ntb_reg_write(8, xlat_reg, MSI_INTEL_ADDR_BASE);
+ ntb->msix_xlat = intel_ntb_reg_read(8, xlat_reg);
+ intel_ntb_reg_write(8, lmt_reg, 0);
} else {
- ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE);
- ntb->msix_xlat = ntb_reg_read(8, xlatoffset);
+ intel_ntb_reg_write(4, xlat_reg, MSI_INTEL_ADDR_BASE);
+ ntb->msix_xlat = intel_ntb_reg_read(4, xlat_reg);
+ intel_ntb_reg_write(4, lmt_reg, 0);
}
+
+ ntb->peer_lapic_bar = &ntb->bar_info[bar_num];
}
- (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
- (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
+ (void)intel_ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
+ (void)intel_ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
/* Zero outgoing translation limits (whole bar size windows) */
- ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
- ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
+ intel_ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
/* Set outgoing translation offsets */
xeon_set_pbar_xlat(ntb, peer_addr->bar2_addr64, NTB_B2B_BAR_1);
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
xeon_set_pbar_xlat(ntb, peer_addr->bar4_addr32, NTB_B2B_BAR_2);
xeon_set_pbar_xlat(ntb, peer_addr->bar5_addr32, NTB_B2B_BAR_3);
} else
@@ -1879,7 +1863,7 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
bar_addr = peer_addr->bar0_addr;
else if (b2b_bar_num == NTB_B2B_BAR_1)
bar_addr = peer_addr->bar2_addr64;
- else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(NTB_SPLIT_BAR))
+ else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
bar_addr = peer_addr->bar4_addr64;
else if (b2b_bar_num == NTB_B2B_BAR_2)
bar_addr = peer_addr->bar4_addr32;
@@ -1892,8 +1876,8 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
* B2B_XLAT_OFFSET is a 64-bit register but can only be written 32 bits
* at a time.
*/
- ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff);
- ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32);
+ intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff);
+ intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32);
return (0);
}
@@ -1912,7 +1896,7 @@ link_is_up(struct ntb_softc *ntb)
if (ntb->type == NTB_XEON)
return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good ||
- !HAS_FEATURE(NTB_SB01BASE_LOCKUP)));
+ !HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)));
KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) == 0);
@@ -1925,11 +1909,11 @@ atom_link_is_err(struct ntb_softc *ntb)
KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
- status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
if ((status & ATOM_LTSSMSTATEJMP_FORCEDETECT) != 0)
return (true);
- status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
return ((status & ATOM_IBIST_ERR_OFLOW) != 0);
}
@@ -1952,8 +1936,8 @@ atom_link_hb(void *arg)
goto out;
}
- if (ntb_poll_link(ntb))
- ntb_link_event(ntb);
+ if (intel_ntb_poll_link(ntb))
+ ntb_link_event(ntb->device);
if (!link_is_up(ntb) && atom_link_is_err(ntb)) {
/* Link is down with error, proceed with recovery */
@@ -1971,166 +1955,47 @@ atom_perform_link_restart(struct ntb_softc *ntb)
uint32_t status;
/* Driver resets the NTB ModPhy lanes - magic! */
- ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60);
- ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60);
+ intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60);
/* Driver waits 100ms to allow the NTB ModPhy to settle */
pause("ModPhy", hz / 10);
/* Clear AER Errors, write to clear */
- status = ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET);
status &= PCIM_AER_COR_REPLAY_ROLLOVER;
- ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status);
/* Clear unexpected electrical idle event in LTSSM, write to clear */
- status = ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET);
status |= ATOM_LTSSMERRSTS0_UNEXPECTEDEI;
- ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status);
/* Clear DeSkew Buffer error, write to clear */
- status = ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET);
status |= ATOM_DESKEWSTS_DBERR;
- ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status);
- status = ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
status &= ATOM_IBIST_ERR_OFLOW;
- ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status);
+ intel_ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status);
/* Releases the NTB state machine to allow the link to retrain */
- status = ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
+ status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
status &= ~ATOM_LTSSMSTATEJMP_FORCEDETECT;
- ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status);
-}
-
-/*
- * ntb_set_ctx() - associate a driver context with an ntb device
- * @ntb: NTB device context
- * @ctx: Driver context
- * @ctx_ops: Driver context operations
- *
- * Associate a driver context and operations with a ntb device. The context is
- * provided by the client driver, and the driver may associate a different
- * context with each ntb device.
- *
- * Return: Zero if the context is associated, otherwise an error number.
- */
-int
-ntb_set_ctx(struct ntb_softc *ntb, void *ctx, const struct ntb_ctx_ops *ops)
-{
-
- if (ctx == NULL || ops == NULL)
- return (EINVAL);
- if (ntb->ctx_ops != NULL)
- return (EINVAL);
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL) {
- CTX_UNLOCK(ntb);
- return (EINVAL);
- }
- ntb->ntb_ctx = ctx;
- ntb->ctx_ops = ops;
- CTX_UNLOCK(ntb);
-
- return (0);
-}
-
-/*
- * It is expected that this will only be used from contexts where the ctx_lock
- * is not needed to protect ntb_ctx lifetime.
- */
-void *
-ntb_get_ctx(struct ntb_softc *ntb, const struct ntb_ctx_ops **ops)
-{
-
- KASSERT(ntb->ntb_ctx != NULL && ntb->ctx_ops != NULL, ("bogus"));
- if (ops != NULL)
- *ops = ntb->ctx_ops;
- return (ntb->ntb_ctx);
-}
-
-/*
- * ntb_clear_ctx() - disassociate any driver context from an ntb device
- * @ntb: NTB device context
- *
- * Clear any association that may exist between a driver context and the ntb
- * device.
- */
-void
-ntb_clear_ctx(struct ntb_softc *ntb)
-{
-
- CTX_LOCK(ntb);
- ntb->ntb_ctx = NULL;
- ntb->ctx_ops = NULL;
- CTX_UNLOCK(ntb);
-}
-
-/*
- * ntb_link_event() - notify driver context of a change in link status
- * @ntb: NTB device context
- *
- * Notify the driver context that the link status may have changed. The driver
- * should call ntb_link_is_up() to get the current status.
- */
-void
-ntb_link_event(struct ntb_softc *ntb)
-{
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL && ntb->ctx_ops->link_event != NULL)
- ntb->ctx_ops->link_event(ntb->ntb_ctx);
- CTX_UNLOCK(ntb);
+ intel_ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status);
}
-/*
- * ntb_db_event() - notify driver context of a doorbell event
- * @ntb: NTB device context
- * @vector: Interrupt vector number
- *
- * Notify the driver context of a doorbell event. If hardware supports
- * multiple interrupt vectors for doorbells, the vector number indicates which
- * vector received the interrupt. The vector number is relative to the first
- * vector used for doorbells, starting at zero, and must be less than
- * ntb_db_vector_count(). The driver may call ntb_db_read() to check which
- * doorbell bits need service, and ntb_db_vector_mask() to determine which of
- * those bits are associated with the vector number.
- */
-static void
-ntb_db_event(struct ntb_softc *ntb, uint32_t vec)
-{
-
- CTX_LOCK(ntb);
- if (ntb->ctx_ops != NULL && ntb->ctx_ops->db_event != NULL)
- ntb->ctx_ops->db_event(ntb->ntb_ctx, vec);
- CTX_UNLOCK(ntb);
-}
-
-/*
- * ntb_link_enable() - enable the link on the secondary side of the ntb
- * @ntb: NTB device context
- * @max_speed: The maximum link speed expressed as PCIe generation number[0]
- * @max_width: The maximum link width expressed as the number of PCIe lanes[0]
- *
- * Enable the link on the secondary side of the ntb. This can only be done
- * from the primary side of the ntb in primary or b2b topology. The ntb device
- * should train the link to its maximum speed and width, or the requested speed
- * and width, whichever is smaller, if supported.
- *
- * Return: Zero on success, otherwise an error number.
- *
- * [0]: Only NTB_SPEED_AUTO and NTB_WIDTH_AUTO are valid inputs; other speed
- * and width input will be ignored.
- */
-int
-ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused,
- enum ntb_width w __unused)
+static int
+intel_ntb_link_enable(device_t dev, enum ntb_speed speed __unused,
+ enum ntb_width width __unused)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
- ntb_printf(2, "%s\n", __func__);
+ intel_ntb_printf(2, "%s\n", __func__);
if (ntb->type == NTB_ATOM) {
pci_write_config(ntb->device, NTB_PPD_OFFSET,
@@ -2139,57 +2004,47 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused,
}
if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
- ntb_link_event(ntb);
+ ntb_link_event(dev);
return (0);
}
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
cntl |= NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP;
- ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
+ intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
return (0);
}
-/*
- * ntb_link_disable() - disable the link on the secondary side of the ntb
- * @ntb: NTB device context
- *
- * Disable the link on the secondary side of the ntb. This can only be done
- * from the primary side of the ntb in primary or b2b topology. The ntb device
- * should disable the link. Returning from this call must indicate that a
- * barrier has passed, though with no more writes may pass in either direction
- * across the link, except if this call returns an error number.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_link_disable(struct ntb_softc *ntb)
+static int
+intel_ntb_link_disable(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
- ntb_printf(2, "%s\n", __func__);
+ intel_ntb_printf(2, "%s\n", __func__);
if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
- ntb_link_event(ntb);
+ ntb_link_event(dev);
return (0);
}
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
- if (HAS_FEATURE(NTB_SPLIT_BAR))
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP | NTB_CNTL_S2P_BAR5_SNOOP);
cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
- ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
+ intel_ntb_reg_write(4, ntb->reg->ntb_ctl, cntl);
return (0);
}
-bool
-ntb_link_enabled(struct ntb_softc *ntb)
+static bool
+intel_ntb_link_enabled(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint32_t cntl;
if (ntb->type == NTB_ATOM) {
@@ -2200,7 +2055,7 @@ ntb_link_enabled(struct ntb_softc *ntb)
if (ntb->conn_type == NTB_CONN_TRANSPARENT)
return (true);
- cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
return ((cntl & NTB_CNTL_LINK_DISABLE) == 0);
}
@@ -2225,11 +2080,11 @@ recover_atom_link(void *arg)
if (atom_link_is_err(ntb))
goto retry;
- status32 = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ status32 = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
if ((status32 & ATOM_CNTL_LINK_DOWN) != 0)
goto out;
- status32 = ntb_reg_read(4, ntb->reg->lnk_sta);
+ status32 = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
width = NTB_LNK_STA_WIDTH(status32);
speed = status32 & NTB_LINK_SPEED_MASK;
@@ -2252,18 +2107,18 @@ retry:
* Polls the HW link status register(s); returns true if something has changed.
*/
static bool
-ntb_poll_link(struct ntb_softc *ntb)
+intel_ntb_poll_link(struct ntb_softc *ntb)
{
uint32_t ntb_cntl;
uint16_t reg_val;
if (ntb->type == NTB_ATOM) {
- ntb_cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ ntb_cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
if (ntb_cntl == ntb->ntb_ctl)
return (false);
ntb->ntb_ctl = ntb_cntl;
- ntb->lnk_sta = ntb_reg_read(4, ntb->reg->lnk_sta);
+ ntb->lnk_sta = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
} else {
db_iowrite_raw(ntb, ntb->self_reg->db_bell, ntb->db_link_mask);
@@ -2273,11 +2128,11 @@ ntb_poll_link(struct ntb_softc *ntb)
ntb->lnk_sta = reg_val;
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
if (_xeon_link_is_up(ntb)) {
if (!ntb->peer_msix_good) {
callout_reset(&ntb->peer_msix_work, 0,
- ntb_exchange_msix, ntb);
+ intel_ntb_exchange_msix, ntb);
return (false);
}
} else {
@@ -2290,7 +2145,7 @@ ntb_poll_link(struct ntb_softc *ntb)
}
static inline enum ntb_speed
-ntb_link_sta_speed(struct ntb_softc *ntb)
+intel_ntb_link_sta_speed(struct ntb_softc *ntb)
{
if (!link_is_up(ntb))
@@ -2299,7 +2154,7 @@ ntb_link_sta_speed(struct ntb_softc *ntb)
}
static inline enum ntb_width
-ntb_link_sta_width(struct ntb_softc *ntb)
+intel_ntb_link_sta_width(struct ntb_softc *ntb)
{
if (!link_is_up(ntb))
@@ -2321,7 +2176,7 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0,
#define NTB_REGFLAGS_MASK (NTB_REGSZ_MASK | NTB_DB_READ | NTB_PCI_REG)
static void
-ntb_sysctl_init(struct ntb_softc *ntb)
+intel_ntb_sysctl_init(struct ntb_softc *ntb)
{
struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar;
struct sysctl_ctx_list *ctx;
@@ -2424,7 +2279,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_xlat,
sysctl_handle_register, "QU", "Incoming XLAT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_xlat,
@@ -2444,7 +2299,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_limit,
sysctl_handle_register, "QU", "Incoming LMT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_limit,
@@ -2535,7 +2390,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->bar_info[NTB_B2B_BAR_1].pbarxlat_off,
sysctl_handle_register, "QU", "Outgoing XLAT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off,
@@ -2555,7 +2410,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | XEON_PBAR2LMT_OFFSET,
sysctl_handle_register, "QU", "Outgoing LMT23 register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt4",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | XEON_PBAR4LMT_OFFSET,
@@ -2579,7 +2434,7 @@ ntb_sysctl_init(struct ntb_softc *ntb)
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_64 | ntb->xlat_reg->bar2_base,
sysctl_handle_register, "QU", "Secondary BAR23 base register");
- if (HAS_FEATURE(NTB_SPLIT_BAR)) {
+ if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar4_base",
CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
NTB_REG_32 | ntb->xlat_reg->bar4_base,
@@ -2602,13 +2457,10 @@ ntb_sysctl_init(struct ntb_softc *ntb)
static int
sysctl_handle_features(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
struct sbuf sb;
int error;
- error = 0;
- ntb = arg1;
-
sbuf_new_for_sysctl(&sb, NULL, 256, req);
sbuf_printf(&sb, "%b", ntb->features, NTB_FEATURES_STR);
@@ -2623,14 +2475,11 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS)
static int
sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
unsigned old, new;
int error;
- error = 0;
- ntb = arg1;
-
- old = ntb_link_enabled(ntb);
+ old = intel_ntb_link_enabled(ntb->device);
error = SYSCTL_OUT(req, &old, sizeof(old));
if (error != 0 || req->newptr == NULL)
@@ -2640,31 +2489,28 @@ sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
if (error != 0)
return (error);
- ntb_printf(0, "Admin set interface state to '%sabled'\n",
+ intel_ntb_printf(0, "Admin set interface state to '%sabled'\n",
(new != 0)? "en" : "dis");
if (new != 0)
- error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ error = intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
else
- error = ntb_link_disable(ntb);
+ error = intel_ntb_link_disable(ntb->device);
return (error);
}
static int
sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
struct sbuf sb;
enum ntb_speed speed;
enum ntb_width width;
int error;
- error = 0;
- ntb = arg1;
-
sbuf_new_for_sysctl(&sb, NULL, 32, req);
- if (ntb_link_is_up(ntb, &speed, &width))
+ if (intel_ntb_link_is_up(ntb->device, &speed, &width))
sbuf_printf(&sb, "up / PCIe Gen %u / Width x%u",
(unsigned)speed, (unsigned)width);
else
@@ -2681,14 +2527,11 @@ sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
static int
sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
{
- struct ntb_softc *ntb;
+ struct ntb_softc *ntb = arg1;
unsigned res;
int error;
- error = 0;
- ntb = arg1;
-
- res = ntb_link_is_up(ntb, NULL, NULL);
+ res = intel_ntb_link_is_up(ntb->device, NULL, NULL);
error = SYSCTL_OUT(req, &res, sizeof(res));
if (error || !req->newptr)
@@ -2727,28 +2570,28 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS)
if (pci)
umv = pci_read_config(ntb->device, reg, 8);
else
- umv = ntb_reg_read(8, reg);
+ umv = intel_ntb_reg_read(8, reg);
outsz = sizeof(uint64_t);
break;
case NTB_REG_32:
if (pci)
umv = pci_read_config(ntb->device, reg, 4);
else
- umv = ntb_reg_read(4, reg);
+ umv = intel_ntb_reg_read(4, reg);
outsz = sizeof(uint32_t);
break;
case NTB_REG_16:
if (pci)
umv = pci_read_config(ntb->device, reg, 2);
else
- umv = ntb_reg_read(2, reg);
+ umv = intel_ntb_reg_read(2, reg);
outsz = sizeof(uint16_t);
break;
case NTB_REG_8:
if (pci)
umv = pci_read_config(ntb->device, reg, 1);
else
- umv = ntb_reg_read(1, reg);
+ umv = intel_ntb_reg_read(1, reg);
outsz = sizeof(uint8_t);
break;
default:
@@ -2768,7 +2611,7 @@ sysctl_handle_register(SYSCTL_HANDLER_ARGS)
}
static unsigned
-ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
+intel_ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
{
if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
@@ -2782,8 +2625,21 @@ ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
return (uidx);
}
+#ifndef EARLY_AP_STARTUP
+static int msix_ready;
+
+static void
+intel_ntb_msix_ready(void *arg __unused)
+{
+
+ msix_ready = 1;
+}
+SYSINIT(intel_ntb_msix_ready, SI_SUB_SMP, SI_ORDER_ANY,
+ intel_ntb_msix_ready, NULL);
+#endif
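+
+/*
+ * Note: SI_SUB_SMP/SI_ORDER_ANY runs only after the APs have been
+ * launched, so without EARLY_AP_STARTUP the flag above keeps the MSI-X
+ * negotiation in intel_ntb_exchange_msix() parked until interrupt
+ * vectors have settled on their final CPUs.
+ */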
+
static void
-ntb_exchange_msix(void *ctx)
+intel_ntb_exchange_msix(void *ctx)
{
struct ntb_softc *ntb;
uint32_t val;
@@ -2796,42 +2652,50 @@ ntb_exchange_msix(void *ctx)
if (ntb->peer_msix_done)
goto msix_done;
+#ifndef EARLY_AP_STARTUP
+ /* Block MSIX negotiation until SMP started and IRQ reshuffled. */
+ if (!msix_ready)
+ goto reschedule;
+#endif
+
+ intel_ntb_get_msix_info(ntb);
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i,
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DATA0 + i,
ntb->msix_data[i].nmd_data);
- ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i,
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_OFS0 + i,
ntb->msix_data[i].nmd_ofs - ntb->msix_xlat);
}
- ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
- ntb_spad_read(ntb, NTB_MSIX_GUARD, &val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_GUARD, &val);
if (val != NTB_MSIX_VER_GUARD)
goto reschedule;
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val);
- ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_DATA0 + i, &val);
+ intel_ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val);
ntb->peer_msix_data[i].nmd_data = val;
- ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val);
- ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_OFS0 + i, &val);
+ intel_ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val);
ntb->peer_msix_data[i].nmd_ofs = val;
}
ntb->peer_msix_done = true;
msix_done:
- ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
- ntb_spad_read(ntb, NTB_MSIX_DONE, &val);
+ intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
+ intel_ntb_spad_read(ntb->device, NTB_MSIX_DONE, &val);
if (val != NTB_MSIX_RECEIVED)
goto reschedule;
+ intel_ntb_spad_clear(ntb->device);
ntb->peer_msix_good = true;
/* Give peer time to see our NTB_MSIX_RECEIVED. */
goto reschedule;
msix_good:
- ntb_poll_link(ntb);
- ntb_link_event(ntb);
+ intel_ntb_poll_link(ntb);
+ ntb_link_event(ntb->device);
return;
reschedule:
@@ -2839,40 +2703,27 @@ reschedule:
if (_xeon_link_is_up(ntb)) {
callout_reset(&ntb->peer_msix_work,
hz * (ntb->peer_msix_good ? 2 : 1) / 100,
- ntb_exchange_msix, ntb);
+ intel_ntb_exchange_msix, ntb);
} else
- ntb_spad_clear(ntb);
+ intel_ntb_spad_clear(ntb->device);
}
/*
* Public API to the rest of the OS
*/
-/**
- * ntb_get_max_spads() - get the total scratch regs usable
- * @ntb: pointer to ntb_softc instance
- *
- * This function returns the max 32bit scratchpad registers usable by the
- * upper layer.
- *
- * RETURNS: total number of scratch pad registers available
- */
-uint8_t
-ntb_get_max_spads(struct ntb_softc *ntb)
+static uint8_t
+intel_ntb_spad_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
return (ntb->spad_count);
}
-/*
- * ntb_mw_count() - Get the number of memory windows available for KPI
- * consumers.
- *
- * (Excludes any MW wholly reserved for register access.)
- */
-uint8_t
-ntb_mw_count(struct ntb_softc *ntb)
+static uint8_t
+intel_ntb_mw_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
uint8_t res;
res = ntb->mw_count;
@@ -2883,25 +2734,15 @@ ntb_mw_count(struct ntb_softc *ntb)
return (res);
}
-/**
- * ntb_spad_write() - write to the secondary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
+static int
+intel_ntb_spad_write(device_t dev, unsigned int idx, uint32_t val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val);
+ intel_ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val);
return (0);
}
@@ -2909,122 +2750,77 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
/*
* Zeros the local scratchpad.
*/
-void
-ntb_spad_clear(struct ntb_softc *ntb)
+static void
+intel_ntb_spad_clear(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
unsigned i;
for (i = 0; i < ntb->spad_count; i++)
- ntb_spad_write(ntb, i, 0);
+ intel_ntb_spad_write(dev, i, 0);
}
-/**
- * ntb_spad_read() - read from the primary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val)
+static int
+intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- *val = ntb_reg_read(4, ntb->self_reg->spad + idx * 4);
+ *val = intel_ntb_reg_read(4, ntb->self_reg->spad + idx * 4);
return (0);
}
-/**
- * ntb_peer_spad_write() - write to the secondary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
+static int
+intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP))
- ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP))
+ intel_ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val);
else
- ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val);
+ intel_ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val);
return (0);
}
-/**
- * ntb_peer_spad_read() - read from the primary scratchpad register
- * @ntb: pointer to ntb_softc instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.
- *
- * RETURNS: An appropriate ERRNO error value on error, or zero for success.
- */
-int
-ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val)
+static int
+intel_ntb_peer_spad_read(device_t dev, unsigned int idx, uint32_t *val)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (idx >= ntb->spad_count)
return (EINVAL);
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP))
- *val = ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP))
+ *val = intel_ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4);
else
- *val = ntb_reg_read(4, ntb->peer_reg->spad + idx * 4);
+ *val = intel_ntb_reg_read(4, ntb->peer_reg->spad + idx * 4);
return (0);
}
-/*
- * ntb_mw_get_range() - get the range of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- * @base: OUT - the base address for mapping the memory window
- * @size: OUT - the size for mapping the memory window
- * @align: OUT - the base alignment for translating the memory window
- * @align_size: OUT - the size alignment for translating the memory window
- *
- * Get the range of a memory window. NULL may be given for any output
- * parameter if the value is not needed. The base and size may be used for
- * mapping the memory window, to access the peer memory. The alignment and
- * size may be used for translating the memory window, for the peer to access
- * memory on the local system.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base,
+static int
+intel_ntb_mw_get_range(device_t dev, unsigned mw_idx, vm_paddr_t *base,
caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
bus_addr_t *plimit)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
bus_addr_t limit;
size_t bar_b2b_off;
enum ntb_bar bar_num;
- if (mw_idx >= ntb_mw_count(ntb))
+ if (mw_idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- mw_idx = ntb_user_mw_to_idx(ntb, mw_idx);
+ mw_idx = intel_ntb_user_mw_to_idx(ntb, mw_idx);
- bar_num = ntb_mw_to_bar(ntb, mw_idx);
+ bar_num = intel_ntb_mw_to_bar(ntb, mw_idx);
bar = &ntb->bar_info[bar_num];
bar_b2b_off = 0;
if (mw_idx == ntb->b2b_mw_idx) {
@@ -3053,37 +2849,21 @@ ntb_mw_get_range(struct ntb_softc *ntb, unsigned mw_idx, vm_paddr_t *base,
return (0);
}
-/*
- * ntb_mw_set_trans() - set the translation of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- * @addr: The dma address local memory to expose to the peer
- * @size: The size of the local memory to expose to the peer
- *
- * Set the translation of a memory window. The peer may access local memory
- * through the window starting at the address, up to the size. The address
- * must be aligned to the alignment specified by ntb_mw_get_range(). The size
- * must be aligned to the size alignment specified by ntb_mw_get_range(). The
- * address must be below the plimit specified by ntb_mw_get_range() (i.e. for
- * 32-bit BARs).
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
- size_t size)
+static int
+intel_ntb_mw_set_trans(device_t dev, unsigned idx, bus_addr_t addr, size_t size)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
uint64_t base, limit, reg_val;
size_t bar_size, mw_size;
uint32_t base_reg, xlat_reg, limit_reg;
enum ntb_bar bar_num;
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
- bar_num = ntb_mw_to_bar(ntb, idx);
+ bar_num = intel_ntb_mw_to_bar(ntb, idx);
bar = &ntb->bar_info[bar_num];
bar_size = bar->size;
@@ -3103,25 +2883,25 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
limit = 0;
if (bar_is_64bit(ntb, bar_num)) {
- base = ntb_reg_read(8, base_reg) & BAR_HIGH_MASK;
+ base = intel_ntb_reg_read(8, base_reg) & BAR_HIGH_MASK;
if (limit_reg != 0 && size != mw_size)
limit = base + size;
/* Set and verify translation address */
- ntb_reg_write(8, xlat_reg, addr);
- reg_val = ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(8, xlat_reg, addr);
+ reg_val = intel_ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK;
if (reg_val != addr) {
- ntb_reg_write(8, xlat_reg, 0);
+ intel_ntb_reg_write(8, xlat_reg, 0);
return (EIO);
}
/* Set and verify the limit */
- ntb_reg_write(8, limit_reg, limit);
- reg_val = ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(8, limit_reg, limit);
+ reg_val = intel_ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK;
if (reg_val != limit) {
- ntb_reg_write(8, limit_reg, base);
- ntb_reg_write(8, xlat_reg, 0);
+ intel_ntb_reg_write(8, limit_reg, base);
+ intel_ntb_reg_write(8, xlat_reg, 0);
return (EIO);
}
} else {
@@ -3132,98 +2912,72 @@ ntb_mw_set_trans(struct ntb_softc *ntb, unsigned idx, bus_addr_t addr,
if (((addr + size) & UINT32_MAX) != (addr + size))
return (ERANGE);
- base = ntb_reg_read(4, base_reg) & BAR_HIGH_MASK;
+ base = intel_ntb_reg_read(4, base_reg) & BAR_HIGH_MASK;
if (limit_reg != 0 && size != mw_size)
limit = base + size;
/* Set and verify translation address */
- ntb_reg_write(4, xlat_reg, addr);
- reg_val = ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(4, xlat_reg, addr);
+ reg_val = intel_ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK;
if (reg_val != addr) {
- ntb_reg_write(4, xlat_reg, 0);
+ intel_ntb_reg_write(4, xlat_reg, 0);
return (EIO);
}
/* Set and verify the limit */
- ntb_reg_write(4, limit_reg, limit);
- reg_val = ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK;
+ intel_ntb_reg_write(4, limit_reg, limit);
+ reg_val = intel_ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK;
if (reg_val != limit) {
- ntb_reg_write(4, limit_reg, base);
- ntb_reg_write(4, xlat_reg, 0);
+ intel_ntb_reg_write(4, limit_reg, base);
+ intel_ntb_reg_write(4, xlat_reg, 0);
return (EIO);
}
}
return (0);
}
-/*
- * ntb_mw_clear_trans() - clear the translation of a memory window
- * @ntb: NTB device context
- * @idx: Memory window number
- *
- * Clear the translation of a memory window. The peer may no longer access
- * local memory through the window.
- *
- * Return: Zero on success, otherwise an error number.
- */
-int
-ntb_mw_clear_trans(struct ntb_softc *ntb, unsigned mw_idx)
+static int
+intel_ntb_mw_clear_trans(device_t dev, unsigned mw_idx)
{
- return (ntb_mw_set_trans(ntb, mw_idx, 0, 0));
+ return (intel_ntb_mw_set_trans(dev, mw_idx, 0, 0));
}
-/*
- * ntb_mw_get_wc - Get the write-combine status of a memory window
- *
- * Returns: Zero on success, setting *wc; otherwise an error number (e.g. if
- * idx is an invalid memory window).
- *
- * Mode is a VM_MEMATTR_* type.
- */
-int
-ntb_mw_get_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t *mode)
+static int
+intel_ntb_mw_get_wc(device_t dev, unsigned idx, vm_memattr_t *mode)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)];
*mode = bar->map_mode;
return (0);
}
-/*
- * ntb_mw_set_wc - Set the write-combine status of a memory window
- *
- * If 'mode' matches the current status, this does nothing and succeeds. Mode
- * is a VM_MEMATTR_* type.
- *
- * Returns: Zero on success, setting the caching attribute on the virtual
- * mapping of the BAR; otherwise an error number (e.g. if idx is an invalid
- * memory window, or if changing the caching attribute fails).
- */
-int
-ntb_mw_set_wc(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
+static int
+intel_ntb_mw_set_wc(device_t dev, unsigned idx, vm_memattr_t mode)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (idx >= ntb_mw_count(ntb))
+ if (idx >= intel_ntb_mw_count(dev))
return (EINVAL);
- idx = ntb_user_mw_to_idx(ntb, idx);
- return (ntb_mw_set_wc_internal(ntb, idx, mode));
+ idx = intel_ntb_user_mw_to_idx(ntb, idx);
+ return (intel_ntb_mw_set_wc_internal(ntb, idx, mode));
}
static int
-ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
+intel_ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
{
struct ntb_pci_bar_info *bar;
int rc;
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)];
if (bar->map_mode == mode)
return (0);
@@ -3234,26 +2988,19 @@ ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
return (rc);
}
-/**
- * ntb_peer_db_set() - Set the doorbell on the secondary/external side
- * @ntb: pointer to ntb_softc instance
- * @bit: doorbell bits to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- */
-void
-ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit)
+static void
+intel_ntb_peer_db_set(device_t dev, uint64_t bit)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
- if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
struct ntb_pci_bar_info *lapic;
unsigned i;
lapic = ntb->peer_lapic_bar;
for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
- if ((bit & ntb_db_vector_mask(ntb, i)) != 0)
+ if ((bit & intel_ntb_db_vector_mask(dev, i)) != 0)
bus_space_write_4(lapic->pci_bus_tag,
lapic->pci_bus_handle,
ntb->peer_msix_data[i].nmd_ofs,
@@ -3262,99 +3009,76 @@ ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit)
return;
}
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
- ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
+ if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
+ intel_ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
return;
}
db_iowrite(ntb, ntb->peer_reg->db_bell, bit);
}
-/*
- * ntb_get_peer_db_addr() - Return the address of the remote doorbell register,
- * as well as the size of the register (via *sz_out).
- *
- * This function allows a caller using I/OAT DMA to chain the remote doorbell
- * ring to its memory window write.
- *
- * Note that writing the peer doorbell via a memory window will *not* generate
- * an interrupt on the remote host; that must be done separately.
- */
-bus_addr_t
-ntb_get_peer_db_addr(struct ntb_softc *ntb, vm_size_t *sz_out)
+static int
+intel_ntb_peer_db_addr(device_t dev, bus_addr_t *db_addr, vm_size_t *db_size)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
struct ntb_pci_bar_info *bar;
uint64_t regoff;
- KASSERT(sz_out != NULL, ("must be non-NULL"));
+ KASSERT((db_addr != NULL && db_size != NULL), ("must be non-NULL"));
- if (!HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ if (!HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
bar = &ntb->bar_info[NTB_CONFIG_BAR];
regoff = ntb->peer_reg->db_bell;
} else {
KASSERT(ntb->b2b_mw_idx != B2B_MW_DISABLED,
("invalid b2b idx"));
- bar = &ntb->bar_info[ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)];
+ bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)];
regoff = XEON_PDOORBELL_OFFSET;
}
KASSERT(bar->pci_bus_tag != X86_BUS_SPACE_IO, ("uh oh"));
- *sz_out = ntb->reg->db_size;
/* HACK: Specific to current x86 bus implementation. */
- return ((uint64_t)bar->pci_bus_handle + regoff);
+ *db_addr = ((uint64_t)bar->pci_bus_handle + regoff);
+ *db_size = ntb->reg->db_size;
+ return (0);
}
-/*
- * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
- * @ntb: NTB device context
- *
- * Hardware may support different number or arrangement of doorbell bits.
- *
- * Return: A mask of doorbell bits supported by the ntb.
- */
-uint64_t
-ntb_db_valid_mask(struct ntb_softc *ntb)
+static uint64_t
+intel_ntb_db_valid_mask(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
return (ntb->db_valid_mask);
}
-/*
- * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
- * @ntb: NTB device context
- * @vector: Doorbell vector number
- *
- * Each interrupt vector may have a different number or arrangement of bits.
- *
- * Return: A mask of doorbell bits serviced by a vector.
- */
-uint64_t
-ntb_db_vector_mask(struct ntb_softc *ntb, uint32_t vector)
+static int
+intel_ntb_db_vector_count(device_t dev)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
+
+ return (ntb->db_vec_count);
+}
+
+static uint64_t
+intel_ntb_db_vector_mask(device_t dev, uint32_t vector)
+{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (vector > ntb->db_vec_count)
return (0);
- return (ntb->db_valid_mask & ntb_vec_mask(ntb, vector));
+ return (ntb->db_valid_mask & intel_ntb_vec_mask(ntb, vector));
}
-/**
- * ntb_link_is_up() - get the current ntb link state
- * @ntb: NTB device context
- * @speed: OUT - The link speed expressed as PCIe generation number
- * @width: OUT - The link width expressed as the number of PCIe lanes
- *
- * RETURNS: true or false based on the hardware link state
- */
-bool
-ntb_link_is_up(struct ntb_softc *ntb, enum ntb_speed *speed,
- enum ntb_width *width)
+static bool
+intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed, enum ntb_width *width)
{
+ struct ntb_softc *ntb = device_get_softc(dev);
if (speed != NULL)
- *speed = ntb_link_sta_speed(ntb);
+ *speed = intel_ntb_link_sta_speed(ntb);
if (width != NULL)
- *width = ntb_link_sta_width(ntb);
+ *width = intel_ntb_link_sta_width(ntb);
return (link_is_up(ntb));
}
@@ -3369,17 +3093,42 @@ save_bar_parameters(struct ntb_pci_bar_info *bar)
bar->vbase = rman_get_virtual(bar->pci_resource);
}
-device_t
-ntb_get_device(struct ntb_softc *ntb)
-{
-
- return (ntb->device);
-}
-
-/* Export HW-specific errata information. */
-bool
-ntb_has_feature(struct ntb_softc *ntb, uint32_t feature)
-{
+static device_method_t ntb_intel_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, intel_ntb_probe),
+ DEVMETHOD(device_attach, intel_ntb_attach),
+ DEVMETHOD(device_detach, intel_ntb_detach),
+ /* NTB interface */
+ DEVMETHOD(ntb_link_is_up, intel_ntb_link_is_up),
+ DEVMETHOD(ntb_link_enable, intel_ntb_link_enable),
+ DEVMETHOD(ntb_link_disable, intel_ntb_link_disable),
+ DEVMETHOD(ntb_link_enabled, intel_ntb_link_enabled),
+ DEVMETHOD(ntb_mw_count, intel_ntb_mw_count),
+ DEVMETHOD(ntb_mw_get_range, intel_ntb_mw_get_range),
+ DEVMETHOD(ntb_mw_set_trans, intel_ntb_mw_set_trans),
+ DEVMETHOD(ntb_mw_clear_trans, intel_ntb_mw_clear_trans),
+ DEVMETHOD(ntb_mw_get_wc, intel_ntb_mw_get_wc),
+ DEVMETHOD(ntb_mw_set_wc, intel_ntb_mw_set_wc),
+ DEVMETHOD(ntb_spad_count, intel_ntb_spad_count),
+ DEVMETHOD(ntb_spad_clear, intel_ntb_spad_clear),
+ DEVMETHOD(ntb_spad_write, intel_ntb_spad_write),
+ DEVMETHOD(ntb_spad_read, intel_ntb_spad_read),
+ DEVMETHOD(ntb_peer_spad_write, intel_ntb_peer_spad_write),
+ DEVMETHOD(ntb_peer_spad_read, intel_ntb_peer_spad_read),
+ DEVMETHOD(ntb_db_valid_mask, intel_ntb_db_valid_mask),
+ DEVMETHOD(ntb_db_vector_count, intel_ntb_db_vector_count),
+ DEVMETHOD(ntb_db_vector_mask, intel_ntb_db_vector_mask),
+ DEVMETHOD(ntb_db_clear, intel_ntb_db_clear),
+ DEVMETHOD(ntb_db_clear_mask, intel_ntb_db_clear_mask),
+ DEVMETHOD(ntb_db_read, intel_ntb_db_read),
+ DEVMETHOD(ntb_db_set_mask, intel_ntb_db_set_mask),
+ DEVMETHOD(ntb_peer_db_addr, intel_ntb_peer_db_addr),
+ DEVMETHOD(ntb_peer_db_set, intel_ntb_peer_db_set),
+ DEVMETHOD_END
+};
- return (HAS_FEATURE(feature));
-}
+static DEFINE_CLASS_0(ntb_hw, ntb_intel_driver, ntb_intel_methods,
+ sizeof(struct ntb_softc));
+DRIVER_MODULE(ntb_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL);
+MODULE_DEPEND(ntb_intel, ntb, 1, 1, 1);
+MODULE_VERSION(ntb_intel, 1);
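
With ntb_hw.h deleted below, clients no longer call the Intel driver
directly: they go through the new ntb(4) kobj interface, for which the
new sys/dev/ntb/ntb.h elsewhere in this merge supplies thin device_t
based wrappers such as ntb_set_ctx() and ntb_link_enable(), used by the
transport later in this diff.  A minimal sketch of a client attach path
under that assumption; the example_* names are hypothetical, and the
doorbell wrappers are assumed to follow the same device_t convention as
the rest of the KPI:

	static void
	example_link_event(void *ctx)
	{
		device_t dev = ctx;

		if (ntb_link_is_up(dev, NULL, NULL))
			device_printf(dev, "NTB link is up\n");
	}

	static void
	example_db_event(void *ctx, uint32_t vector)
	{
		device_t dev = ctx;

		/* Ack only the doorbell bits routed to this vector. */
		ntb_db_clear(dev, ntb_db_vector_mask(dev, vector));
	}

	static const struct ntb_ctx_ops example_ctx_ops = {
		.link_event = example_link_event,
		.db_event = example_db_event,
	};

	static int
	example_attach(device_t dev)
	{
		int rc;

		/* Register callbacks, then ask the link to train. */
		rc = ntb_set_ctx(dev, dev, &example_ctx_ops);
		if (rc != 0)
			return (rc);
		return (ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO));
	}
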
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h
deleted file mode 100644
index f05acda..0000000
--- a/sys/dev/ntb/ntb_hw/ntb_hw.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*-
- * Copyright (C) 2013 Intel Corporation
- * Copyright (C) 2015 EMC Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _NTB_HW_H_
-#define _NTB_HW_H_
-
-struct ntb_softc;
-
-#define NTB_MAX_NUM_MW 3
-
-enum ntb_speed {
- NTB_SPEED_AUTO = -1,
- NTB_SPEED_NONE = 0,
- NTB_SPEED_GEN1 = 1,
- NTB_SPEED_GEN2 = 2,
- NTB_SPEED_GEN3 = 3,
-};
-
-enum ntb_width {
- NTB_WIDTH_AUTO = -1,
- NTB_WIDTH_NONE = 0,
- NTB_WIDTH_1 = 1,
- NTB_WIDTH_2 = 2,
- NTB_WIDTH_4 = 4,
- NTB_WIDTH_8 = 8,
- NTB_WIDTH_12 = 12,
- NTB_WIDTH_16 = 16,
- NTB_WIDTH_32 = 32,
-};
-
-SYSCTL_DECL(_hw_ntb);
-
-typedef void (*ntb_db_callback)(void *data, uint32_t vector);
-typedef void (*ntb_event_callback)(void *data);
-
-struct ntb_ctx_ops {
- ntb_event_callback link_event;
- ntb_db_callback db_event;
-};
-
-device_t ntb_get_device(struct ntb_softc *);
-
-bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *);
-void ntb_link_event(struct ntb_softc *);
-int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width);
-int ntb_link_disable(struct ntb_softc *);
-bool ntb_link_enabled(struct ntb_softc *);
-
-int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *);
-void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **);
-void ntb_clear_ctx(struct ntb_softc *);
-
-uint8_t ntb_mw_count(struct ntb_softc *);
-int ntb_mw_get_range(struct ntb_softc *, unsigned mw_idx, vm_paddr_t *base,
- caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
- bus_addr_t *plimit);
-int ntb_mw_set_trans(struct ntb_softc *, unsigned mw_idx, bus_addr_t, size_t);
-int ntb_mw_clear_trans(struct ntb_softc *, unsigned mw_idx);
-
-int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode);
-int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode);
-
-uint8_t ntb_get_max_spads(struct ntb_softc *ntb);
-void ntb_spad_clear(struct ntb_softc *ntb);
-int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val);
-int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val);
-int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx,
- uint32_t val);
-int ntb_peer_spad_read(struct ntb_softc *ntb, unsigned int idx,
- uint32_t *val);
-
-uint64_t ntb_db_valid_mask(struct ntb_softc *);
-uint64_t ntb_db_vector_mask(struct ntb_softc *, uint32_t vector);
-bus_addr_t ntb_get_peer_db_addr(struct ntb_softc *, vm_size_t *sz_out);
-
-void ntb_db_clear(struct ntb_softc *, uint64_t bits);
-void ntb_db_clear_mask(struct ntb_softc *, uint64_t bits);
-uint64_t ntb_db_read(struct ntb_softc *);
-void ntb_db_set_mask(struct ntb_softc *, uint64_t bits);
-void ntb_peer_db_set(struct ntb_softc *, uint64_t bits);
-
-#define XEON_SPAD_COUNT 16
-#define ATOM_SPAD_COUNT 16
-
-/* Hardware owns the low 16 bits of features. */
-#define NTB_BAR_SIZE_4K (1 << 0)
-#define NTB_SDOORBELL_LOCKUP (1 << 1)
-#define NTB_SB01BASE_LOCKUP (1 << 2)
-#define NTB_B2BDOORBELL_BIT14 (1 << 3)
-/* Software/configuration owns the top 16 bits. */
-#define NTB_SPLIT_BAR (1ull << 16)
-
-#define NTB_FEATURES_STR \
- "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \
- "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K"
-
-bool ntb_has_feature(struct ntb_softc *, uint32_t);
-
-#endif /* _NTB_HW_H_ */
diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h
index fb445d7..a037736 100644
--- a/sys/dev/ntb/ntb_hw/ntb_regs.h
+++ b/sys/dev/ntb/ntb_hw/ntb_regs.h
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (C) 2013 Intel Corporation
* Copyright (C) 2015 EMC Corporation
* All rights reserved.
@@ -76,6 +77,7 @@
#define XEON_SDBMSK_OFFSET 0x0066
#define XEON_USMEMMISS_OFFSET 0x0070
#define XEON_SPAD_OFFSET 0x0080
+#define XEON_SPAD_COUNT 16
#define XEON_SPADSEMA4_OFFSET 0x00c0
#define XEON_WCCNTRL_OFFSET 0x00e0
#define XEON_UNCERRSTS_OFFSET 0x014c
@@ -104,6 +106,7 @@
#define ATOM_NTBCNTL_OFFSET 0x0060
#define ATOM_EBDF_OFFSET 0x0064
#define ATOM_SPAD_OFFSET 0x0080
+#define ATOM_SPAD_COUNT 16
#define ATOM_SPADSEMA_OFFSET 0x00c0
#define ATOM_STKYSPAD_OFFSET 0x00c4
#define ATOM_PBAR2XLAT_OFFSET 0x8008
diff --git a/sys/dev/ntb/ntb_if.m b/sys/dev/ntb/ntb_if.m
new file mode 100644
index 0000000..d8ca227
--- /dev/null
+++ b/sys/dev/ntb/ntb_if.m
@@ -0,0 +1,210 @@
+#-
+# Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+INTERFACE ntb;
+
+HEADER {
+ enum ntb_speed {
+ NTB_SPEED_AUTO = -1,
+ NTB_SPEED_NONE = 0,
+ NTB_SPEED_GEN1 = 1,
+ NTB_SPEED_GEN2 = 2,
+ NTB_SPEED_GEN3 = 3,
+ };
+
+ enum ntb_width {
+ NTB_WIDTH_AUTO = -1,
+ NTB_WIDTH_NONE = 0,
+ NTB_WIDTH_1 = 1,
+ NTB_WIDTH_2 = 2,
+ NTB_WIDTH_4 = 4,
+ NTB_WIDTH_8 = 8,
+ NTB_WIDTH_12 = 12,
+ NTB_WIDTH_16 = 16,
+ NTB_WIDTH_32 = 32,
+ };
+
+ typedef void (*ntb_db_callback)(void *data, uint32_t vector);
+ typedef void (*ntb_event_callback)(void *data);
+ struct ntb_ctx_ops {
+ ntb_event_callback link_event;
+ ntb_db_callback db_event;
+ };
+};
+
+METHOD bool link_is_up {
+ device_t ntb;
+ enum ntb_speed *speed;
+ enum ntb_width *width;
+};
+
+METHOD int link_enable {
+ device_t ntb;
+ enum ntb_speed speed;
+ enum ntb_width width;
+};
+
+METHOD int link_disable {
+ device_t ntb;
+};
+
+METHOD bool link_enabled {
+ device_t ntb;
+};
+
+METHOD int set_ctx {
+ device_t ntb;
+ void *ctx;
+ const struct ntb_ctx_ops *ctx_ops;
+};
+
+METHOD void * get_ctx {
+ device_t ntb;
+ const struct ntb_ctx_ops **ctx_ops;
+};
+
+METHOD void clear_ctx {
+ device_t ntb;
+};
+
+METHOD uint8_t mw_count {
+ device_t ntb;
+};
+
+METHOD int mw_get_range {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_paddr_t *base;
+ caddr_t *vbase;
+ size_t *size;
+ size_t *align;
+ size_t *align_size;
+ bus_addr_t *plimit;
+};
+
+METHOD int mw_set_trans {
+ device_t ntb;
+ unsigned mw_idx;
+ bus_addr_t addr;
+ size_t size;
+};
+
+METHOD int mw_clear_trans {
+ device_t ntb;
+ unsigned mw_idx;
+};
+
+METHOD int mw_get_wc {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_memattr_t *mode;
+};
+
+METHOD int mw_set_wc {
+ device_t ntb;
+ unsigned mw_idx;
+ vm_memattr_t mode;
+};
+
+METHOD uint8_t spad_count {
+ device_t ntb;
+};
+
+METHOD void spad_clear {
+ device_t ntb;
+};
+
+METHOD int spad_write {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t val;
+};
+
+METHOD int spad_read {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t *val;
+};
+
+METHOD int peer_spad_write {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t val;
+};
+
+METHOD int peer_spad_read {
+ device_t ntb;
+ unsigned int idx;
+ uint32_t *val;
+};
+
+METHOD uint64_t db_valid_mask {
+ device_t ntb;
+};
+
+METHOD int db_vector_count {
+ device_t ntb;
+};
+
+METHOD uint64_t db_vector_mask {
+ device_t ntb;
+ uint32_t vector;
+};
+
+METHOD int peer_db_addr {
+ device_t ntb;
+ bus_addr_t *db_addr;
+ vm_size_t *db_size;
+};
+
+METHOD void db_clear {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD void db_clear_mask {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD uint64_t db_read {
+ device_t ntb;
+};
+
+METHOD void db_set_mask {
+ device_t ntb;
+ uint64_t bits;
+};
+
+METHOD void peer_db_set {
+ device_t ntb;
+ uint64_t bits;
+};
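
At build time the sys/tools/makeobjops.awk script expands the METHOD
declarations above into ntb_if.h/ntb_if.c, one kobj dispatch stub per
method; the DEVMETHOD table in ntb_hw.c above binds those slots to the
intel_ntb_* implementations.  The generated inline for mw_count looks
roughly like this (a sketch of the generator's output, not part of the
diff):

	static __inline uint8_t
	NTB_MW_COUNT(device_t ntb)
	{
		kobjop_t _m;

		KOBJOPLOOKUP(((kobj_t)ntb)->ops, ntb_mw_count);
		return ((ntb_mw_count_t *) _m)(ntb);
	}
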
diff --git a/sys/dev/ntb/ntb_transport.c b/sys/dev/ntb/ntb_transport.c
new file mode 100644
index 0000000..5297db9
--- /dev/null
+++ b/sys/dev/ntb/ntb_transport.c
@@ -0,0 +1,1521 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * Copyright (C) 2013 Intel Corporation
+ * Copyright (C) 2015 EMC Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The Non-Transparent Bridge (NTB) is a device that allows you to connect
+ * two or more systems using PCI-e links, providing remote memory access.
+ *
+ * This module contains a transport for sending and receiving messages by
+ * writing to remote memory window(s) provided by the underlying NTB device.
+ *
+ * NOTE: Much of the code in this module is shared with Linux. Any patches may
+ * be picked up and redistributed in Linux with a dual GPL/BSD license.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+
+#include "ntb.h"
+#include "ntb_transport.h"
+
+#define KTR_NTB KTR_SPARE3
+
+#define NTB_TRANSPORT_VERSION 4
+
+static SYSCTL_NODE(_hw, OID_AUTO, ntb_transport, CTLFLAG_RW, 0, "ntb_transport");
+
+static unsigned g_ntb_transport_debug_level;
+TUNABLE_INT("hw.ntb_transport.debug_level", &g_ntb_transport_debug_level);
+SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, debug_level, CTLFLAG_RWTUN,
+ &g_ntb_transport_debug_level, 0,
+ "ntb_transport log level -- higher is more verbose");
+#define ntb_printf(lvl, ...) do { \
+ if ((lvl) <= g_ntb_transport_debug_level) { \
+ printf(__VA_ARGS__); \
+ } \
+} while (0)
+
+static unsigned transport_mtu = 0x10000;
+
+static uint64_t max_mw_size;
+TUNABLE_QUAD("hw.ntb_transport.max_mw_size", &max_mw_size);
+SYSCTL_UQUAD(_hw_ntb_transport, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0,
+ "If enabled (non-zero), limit the size of large memory windows. "
+ "Both sides of the NTB MUST set the same value here.");
+
+static unsigned enable_xeon_watchdog;
+TUNABLE_INT("hw.ntb_transport.enable_xeon_watchdog", &enable_xeon_watchdog);
+SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN,
+ &enable_xeon_watchdog, 0, "If non-zero, write a register every second to "
+ "keep a watchdog from tearing down the NTB link");
+
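+/*
+ * All three knobs above are loader tunables; a minimal /boot/loader.conf
+ * sketch (values illustrative; both hosts must set the same max_mw_size):
+ *
+ *	hw.ntb_transport.debug_level=1
+ *	hw.ntb_transport.max_mw_size=0x1000000
+ *	hw.ntb_transport.enable_xeon_watchdog=1
+ */
+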
+STAILQ_HEAD(ntb_queue_list, ntb_queue_entry);
+
+typedef uint32_t ntb_q_idx_t;
+
+struct ntb_queue_entry {
+ /* ntb_queue list reference */
+ STAILQ_ENTRY(ntb_queue_entry) entry;
+
+ /* info on data to be transferred */
+ void *cb_data;
+ void *buf;
+ uint32_t len;
+ uint32_t flags;
+
+ struct ntb_transport_qp *qp;
+ struct ntb_payload_header *x_hdr;
+ ntb_q_idx_t index;
+};
+
+struct ntb_rx_info {
+ ntb_q_idx_t entry;
+};
+
+struct ntb_transport_qp {
+ struct ntb_transport_ctx *transport;
+ device_t dev;
+
+ void *cb_data;
+
+ bool client_ready;
+ volatile bool link_is_up;
+ uint8_t qp_num; /* Only 64 QPs are allowed. 0-63 */
+
+ struct ntb_rx_info *rx_info;
+ struct ntb_rx_info *remote_rx_info;
+
+ void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ struct ntb_queue_list tx_free_q;
+ struct mtx ntb_tx_free_q_lock;
+ caddr_t tx_mw;
+ bus_addr_t tx_mw_phys;
+ ntb_q_idx_t tx_index;
+ ntb_q_idx_t tx_max_entry;
+ uint64_t tx_max_frame;
+
+ void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ struct ntb_queue_list rx_post_q;
+ struct ntb_queue_list rx_pend_q;
+ /* ntb_rx_q_lock: synchronize access to rx_XXXX_q */
+ struct mtx ntb_rx_q_lock;
+ struct task rxc_db_work;
+ struct taskqueue *rxc_tq;
+ caddr_t rx_buff;
+ ntb_q_idx_t rx_index;
+ ntb_q_idx_t rx_max_entry;
+ uint64_t rx_max_frame;
+
+ void (*event_handler)(void *data, enum ntb_link_event status);
+ struct callout link_work;
+ struct callout rx_full;
+
+ uint64_t last_rx_no_buf;
+
+ /* Stats */
+ uint64_t rx_bytes;
+ uint64_t rx_pkts;
+ uint64_t rx_ring_empty;
+ uint64_t rx_err_no_buf;
+ uint64_t rx_err_oflow;
+ uint64_t rx_err_ver;
+ uint64_t tx_bytes;
+ uint64_t tx_pkts;
+ uint64_t tx_ring_full;
+ uint64_t tx_err_no_buf;
+
+ struct mtx tx_lock;
+};
+
+struct ntb_transport_mw {
+ vm_paddr_t phys_addr;
+ size_t phys_size;
+ size_t xlat_align;
+ size_t xlat_align_size;
+ bus_addr_t addr_limit;
+ /* Tx buff is off vbase / phys_addr */
+ caddr_t vbase;
+ size_t xlat_size;
+ size_t buff_size;
+ /* Rx buff is off virt_addr / dma_addr */
+ caddr_t virt_addr;
+ bus_addr_t dma_addr;
+};
+
+struct ntb_transport_child {
+ device_t dev;
+ int qpoff;
+ int qpcnt;
+ struct ntb_transport_child *next;
+};
+
+struct ntb_transport_ctx {
+ device_t dev;
+ struct ntb_transport_child *child;
+ struct ntb_transport_mw *mw_vec;
+ struct ntb_transport_qp *qp_vec;
+ unsigned mw_count;
+ unsigned qp_count;
+ uint64_t qp_bitmap;
+ volatile bool link_is_up;
+ struct callout link_work;
+ struct callout link_watchdog;
+ struct task link_cleanup;
+};
+
+enum {
+ NTBT_DESC_DONE_FLAG = 1 << 0,
+ NTBT_LINK_DOWN_FLAG = 1 << 1,
+};
+
+struct ntb_payload_header {
+ ntb_q_idx_t ver;
+ uint32_t len;
+ uint32_t flags;
+};
+
+enum {
+	/*
+	 * The order of this enum is part of the remote protocol.  Do not
+	 * reorder without bumping the protocol version (and it's probably
+	 * best to keep the protocol in lock-step with the Linux NTB driver).
+	 */
+ NTBT_VERSION = 0,
+ NTBT_QP_LINKS,
+ NTBT_NUM_QPS,
+ NTBT_NUM_MWS,
+ /*
+ * N.B.: transport_link_work assumes MW1 enums = MW0 + 2.
+ */
+ NTBT_MW0_SZ_HIGH,
+ NTBT_MW0_SZ_LOW,
+ NTBT_MW1_SZ_HIGH,
+ NTBT_MW1_SZ_LOW,
+
+ /*
+	 * Some NTB-using hardware has a watchdog to work around NTB hangs; if
+ * a register or doorbell isn't written every few seconds, the link is
+ * torn down. Write an otherwise unused register every few seconds to
+ * work around this watchdog.
+ */
+ NTBT_WATCHDOG_SPAD = 15
+};
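+
+/*
+ * Sanity check for the N.B. above: NTBT_MW0_SZ_HIGH == 4 and
+ * NTBT_MW1_SZ_HIGH == 6, so the MW1 scratchpad pair is indeed MW0 + 2.
+ */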
+
+#define QP_TO_MW(nt, qp)	((qp) % (nt)->mw_count)
+#define NTB_QP_DEF_NUM_ENTRIES 100
+#define NTB_LINK_DOWN_TIMEOUT 10
+
+static int ntb_transport_probe(device_t dev);
+static int ntb_transport_attach(device_t dev);
+static int ntb_transport_detach(device_t dev);
+static void ntb_transport_init_queue(struct ntb_transport_ctx *nt,
+ unsigned int qp_num);
+static int ntb_process_tx(struct ntb_transport_qp *qp,
+ struct ntb_queue_entry *entry);
+static void ntb_transport_rxc_db(void *arg, int pending);
+static int ntb_process_rxc(struct ntb_transport_qp *qp);
+static void ntb_memcpy_rx(struct ntb_transport_qp *qp,
+ struct ntb_queue_entry *entry, void *offset);
+static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp,
+ void *data);
+static void ntb_complete_rxc(struct ntb_transport_qp *qp);
+static void ntb_transport_doorbell_callback(void *data, uint32_t vector);
+static void ntb_transport_event_callback(void *data);
+static void ntb_transport_link_work(void *arg);
+static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size);
+static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw);
+static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
+ unsigned int qp_num);
+static void ntb_qp_link_work(void *arg);
+static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt);
+static void ntb_transport_link_cleanup_work(void *, int);
+static void ntb_qp_link_down(struct ntb_transport_qp *qp);
+static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp);
+static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp);
+static void ntb_send_link_down(struct ntb_transport_qp *qp);
+static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
+ struct ntb_queue_list *list);
+static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock,
+ struct ntb_queue_list *list);
+static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock,
+ struct ntb_queue_list *from, struct ntb_queue_list *to);
+static void xeon_link_watchdog_hb(void *);
+
+static const struct ntb_ctx_ops ntb_transport_ops = {
+ .link_event = ntb_transport_event_callback,
+ .db_event = ntb_transport_doorbell_callback,
+};
+
+MALLOC_DEFINE(M_NTB_T, "ntb_transport", "ntb transport driver");
+
+static inline void
+iowrite32(uint32_t val, void *addr)
+{
+
+ bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr,
+ val);
+}
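+
+/*
+ * The zero bus handle is the "HACK" above: for X86_BUS_SPACE_MEM the
+ * bus handle is simply a kernel virtual address, so passing the target
+ * pointer as the offset makes this a volatile 32-bit store into the
+ * mapped memory window.
+ */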
+
+/* Transport Init and teardown */
+
+static void
+xeon_link_watchdog_hb(void *arg)
+{
+ struct ntb_transport_ctx *nt;
+
+ nt = arg;
+ ntb_spad_write(nt->dev, NTBT_WATCHDOG_SPAD, 0);
+ callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt);
+}
+
+static int
+ntb_transport_probe(device_t dev)
+{
+
+ device_set_desc(dev, "NTB Transport");
+ return (0);
+}
+
+static int
+ntb_transport_attach(device_t dev)
+{
+ struct ntb_transport_ctx *nt = device_get_softc(dev);
+ struct ntb_transport_child **cpp = &nt->child;
+ struct ntb_transport_child *nc;
+ struct ntb_transport_mw *mw;
+ uint64_t db_bitmap;
+ int rc, i, db_count, spad_count, qp, qpu, qpo, qpt;
+ char cfg[128] = "";
+ char buf[32];
+ char *n, *np, *c, *name;
+
+ nt->dev = dev;
+ nt->mw_count = ntb_mw_count(dev);
+ spad_count = ntb_spad_count(dev);
+ db_bitmap = ntb_db_valid_mask(dev);
+ db_count = flsll(db_bitmap);
+	KASSERT(db_bitmap == (1ull << db_count) - 1,
+	    ("Doorbells are not sequential (%jx).\n", (uintmax_t)db_bitmap));
+
+ device_printf(dev, "%d memory windows, %d scratchpads, "
+ "%d doorbells\n", nt->mw_count, spad_count, db_count);
+
+ if (nt->mw_count == 0) {
+ device_printf(dev, "At least 1 memory window required.\n");
+ return (ENXIO);
+ }
+ if (spad_count < 6) {
+ device_printf(dev, "At least 6 scratchpads required.\n");
+ return (ENXIO);
+ }
+ if (spad_count < 4 + 2 * nt->mw_count) {
+ nt->mw_count = (spad_count - 4) / 2;
+ device_printf(dev, "Scratchpads enough only for %d "
+ "memory windows.\n", nt->mw_count);
+ }
+ if (db_bitmap == 0) {
+ device_printf(dev, "At least one doorbell required.\n");
+ return (ENXIO);
+ }
+
+ nt->mw_vec = malloc(nt->mw_count * sizeof(*nt->mw_vec), M_NTB_T,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < nt->mw_count; i++) {
+ mw = &nt->mw_vec[i];
+
+ rc = ntb_mw_get_range(dev, i, &mw->phys_addr, &mw->vbase,
+ &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size,
+ &mw->addr_limit);
+ if (rc != 0)
+ goto err;
+
+ mw->buff_size = 0;
+ mw->xlat_size = 0;
+ mw->virt_addr = NULL;
+ mw->dma_addr = 0;
+
+ rc = ntb_mw_set_wc(dev, i, VM_MEMATTR_WRITE_COMBINING);
+ if (rc)
+ ntb_printf(0, "Unable to set mw%d caching\n", i);
+ }
+
+ qpu = 0;
+ qpo = imin(db_count, nt->mw_count);
+ qpt = db_count;
+
+ snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev),
+ device_get_unit(dev));
+ TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg));
+ n = cfg;
+ i = 0;
+ while ((c = strsep(&n, ",")) != NULL) {
+ np = c;
+ name = strsep(&np, ":");
+ if (name != NULL && name[0] == 0)
+ name = NULL;
+ qp = (np && np[0] != 0) ? strtol(np, NULL, 10) : qpo - qpu;
+ if (qp <= 0)
+ qp = 1;
+
+ if (qp > qpt - qpu) {
+ device_printf(dev, "Not enough resources for config\n");
+ break;
+ }
+
+ nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO);
+ nc->qpoff = qpu;
+ nc->qpcnt = qp;
+ nc->dev = device_add_child(dev, name, -1);
+ if (nc->dev == NULL) {
+ device_printf(dev, "Can not add child.\n");
+ break;
+ }
+ device_set_ivars(nc->dev, nc);
+ *cpp = nc;
+ cpp = &nc->next;
+
+ if (bootverbose) {
+ device_printf(dev, "%d \"%s\": queues %d",
+ i, name, qpu);
+ if (qp > 1)
+ printf("-%d", qpu + qp - 1);
+ printf("\n");
+ }
+
+ qpu += qp;
+ i++;
+ }
+ nt->qp_count = qpu;
+
+ nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_T,
+ M_WAITOK | M_ZERO);
+
+ for (i = 0; i < nt->qp_count; i++)
+ ntb_transport_init_queue(nt, i);
+
+ callout_init(&nt->link_work, 0);
+ callout_init(&nt->link_watchdog, 0);
+ TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt);
+
+ rc = ntb_set_ctx(dev, nt, &ntb_transport_ops);
+ if (rc != 0)
+ goto err;
+
+ nt->link_is_up = false;
+ ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+
+ if (enable_xeon_watchdog != 0)
+ callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt);
+
+ bus_generic_attach(dev);
+ return (0);
+
+err:
+ free(nt->qp_vec, M_NTB_T);
+ free(nt->mw_vec, M_NTB_T);
+ return (rc);
+}
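
The strsep() loop in the attach routine above parses a per-unit device hint made of
comma-separated "name:count" pairs: an empty name leaves the child driver
unspecified, and an omitted count claims the remaining default share of queues.
A hypothetical /boot/loader.conf entry for unit 0 (the child driver name
"ntb_netdev" is illustrative only, not mandated by this commit):

	hint.ntb_transport.0.config="ntb_netdev:2,:1"

This would create one "ntb_netdev" child owning queues 0-1 and one anonymous
child owning queue 2, subject to the doorbell and memory-window limits computed
earlier in the function.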
+
+static int
+ntb_transport_detach(device_t dev)
+{
+ struct ntb_transport_ctx *nt = device_get_softc(dev);
+ struct ntb_transport_child **cpp = &nt->child;
+ struct ntb_transport_child *nc;
+ int error = 0, i;
+
+ while ((nc = *cpp) != NULL) {
+ *cpp = (*cpp)->next;
+ error = device_delete_child(dev, nc->dev);
+ if (error)
+ break;
+ free(nc, M_DEVBUF);
+ }
+	KASSERT(nt->qp_bitmap == 0,
+	    ("Some queues not freed on detach (%jx)", (uintmax_t)nt->qp_bitmap));
+
+ ntb_transport_link_cleanup(nt);
+ taskqueue_drain(taskqueue_swi, &nt->link_cleanup);
+ callout_drain(&nt->link_work);
+ callout_drain(&nt->link_watchdog);
+
+ ntb_link_disable(dev);
+ ntb_clear_ctx(dev);
+
+ for (i = 0; i < nt->mw_count; i++)
+ ntb_free_mw(nt, i);
+
+ free(nt->qp_vec, M_NTB_T);
+ free(nt->mw_vec, M_NTB_T);
+ return (0);
+}
+
+int
+ntb_transport_queue_count(device_t dev)
+{
+ struct ntb_transport_child *nc = device_get_ivars(dev);
+
+ return (nc->qpcnt);
+}
+
+static void
+ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num)
+{
+ struct ntb_transport_mw *mw;
+ struct ntb_transport_qp *qp;
+ vm_paddr_t mw_base;
+ uint64_t mw_size, qp_offset;
+ size_t tx_size;
+ unsigned num_qps_mw, mw_num, mw_count;
+
+ mw_count = nt->mw_count;
+ mw_num = QP_TO_MW(nt, qp_num);
+ mw = &nt->mw_vec[mw_num];
+
+ qp = &nt->qp_vec[qp_num];
+ qp->qp_num = qp_num;
+ qp->transport = nt;
+ qp->dev = nt->dev;
+ qp->client_ready = false;
+ qp->event_handler = NULL;
+ ntb_qp_link_down_reset(qp);
+
+ if (mw_num < nt->qp_count % mw_count)
+ num_qps_mw = nt->qp_count / mw_count + 1;
+ else
+ num_qps_mw = nt->qp_count / mw_count;
+
+ mw_base = mw->phys_addr;
+ mw_size = mw->phys_size;
+
+ tx_size = mw_size / num_qps_mw;
+ qp_offset = tx_size * (qp_num / mw_count);
+
+ qp->tx_mw = mw->vbase + qp_offset;
+	KASSERT(qp->tx_mw != NULL, ("tx_mw is NULL for qp %u", qp_num));
+
+ /* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */
+ qp->tx_mw_phys = mw_base + qp_offset;
+	KASSERT(qp->tx_mw_phys != 0, ("tx_mw_phys is 0 for qp %u", qp_num));
+
+ tx_size -= sizeof(struct ntb_rx_info);
+ qp->rx_info = (void *)(qp->tx_mw + tx_size);
+
+	/* Due to housekeeping, there must be at least two buffers. */
+ qp->tx_max_frame = qmin(transport_mtu, tx_size / 2);
+ qp->tx_max_entry = tx_size / qp->tx_max_frame;
+
+ callout_init(&qp->link_work, 0);
+ callout_init(&qp->rx_full, 1);
+
+ mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN);
+ mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN);
+ mtx_init(&qp->tx_lock, "ntb transport tx", NULL, MTX_DEF);
+ TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp);
+ qp->rxc_tq = taskqueue_create("ntbt_rx", M_WAITOK,
+ taskqueue_thread_enqueue, &qp->rxc_tq);
+ taskqueue_start_threads(&qp->rxc_tq, 1, PI_NET, "%s rx%d",
+ device_get_nameunit(nt->dev), qp_num);
+
+ STAILQ_INIT(&qp->rx_post_q);
+ STAILQ_INIT(&qp->rx_pend_q);
+ STAILQ_INIT(&qp->tx_free_q);
+}
+
+void
+ntb_transport_free_queue(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+ struct ntb_queue_entry *entry;
+
+ callout_drain(&qp->link_work);
+
+ ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
+ taskqueue_drain_all(qp->rxc_tq);
+ taskqueue_free(qp->rxc_tq);
+
+ qp->cb_data = NULL;
+ qp->rx_handler = NULL;
+ qp->tx_handler = NULL;
+ qp->event_handler = NULL;
+
+ while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q)))
+ free(entry, M_NTB_T);
+
+ while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q)))
+ free(entry, M_NTB_T);
+
+ while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
+ free(entry, M_NTB_T);
+
+	nt->qp_bitmap &= ~(1ull << qp->qp_num);
+}
+
+/**
+ * ntb_transport_create_queue - Create a new NTB transport layer queue
+ * @rx_handler: receive callback function
+ * @tx_handler: transmit callback function
+ * @event_handler: event callback function
+ *
+ * Create a new NTB transport layer queue and provide the queue with a callback
+ * routine for both transmit and receive. The receive callback routine will be
+ * used to pass up data when the transport has received it on the queue. The
+ * transmit callback routine will be called when the transport has completed the
+ * transmission of the data on the queue and the data is ready to be freed.
+ *
+ * RETURNS: pointer to newly created ntb_queue, NULL on error.
+ */
+struct ntb_transport_qp *
+ntb_transport_create_queue(device_t dev, int q,
+ const struct ntb_queue_handlers *handlers, void *data)
+{
+ struct ntb_transport_child *nc = device_get_ivars(dev);
+ struct ntb_transport_ctx *nt = device_get_softc(device_get_parent(dev));
+ struct ntb_queue_entry *entry;
+ struct ntb_transport_qp *qp;
+ int i;
+
+ if (q < 0 || q >= nc->qpcnt)
+ return (NULL);
+
+ qp = &nt->qp_vec[nc->qpoff + q];
+	nt->qp_bitmap |= (1ull << qp->qp_num);
+ qp->cb_data = data;
+ qp->rx_handler = handlers->rx_handler;
+ qp->tx_handler = handlers->tx_handler;
+ qp->event_handler = handlers->event_handler;
+
+ for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+ entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
+ entry->cb_data = data;
+ entry->buf = NULL;
+ entry->len = transport_mtu;
+ entry->qp = qp;
+ ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q);
+ }
+
+ for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+ entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
+ entry->qp = qp;
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ }
+
+ ntb_db_clear(dev, 1ull << qp->qp_num);
+ return (qp);
+}
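
A minimal sketch of a child driver consuming this interface; the xx_* names and
softc layout are hypothetical, but the callback signatures follow struct
ntb_queue_handlers as declared in ntb_transport.h below, and the mbuf ownership
rules follow the rx/tx completion paths in this file:

struct xx_softc {
	struct ntb_transport_qp *qp;
};

/* Hypothetical client callbacks; the transport hands over mbuf ownership. */
static void
xx_rx(struct ntb_transport_qp *qp, void *qp_data, void *data, int len)
{
	/* 'data' is an mbuf received on the queue; consume or free it. */
	m_freem(data);
}

static void
xx_tx(struct ntb_transport_qp *qp, void *qp_data, void *data, int len)
{
	/* Transmission completed; the mbuf may now be freed. */
	m_freem(data);
}

static void
xx_event(void *data, enum ntb_link_event status)
{
	/* React to NTB_LINK_UP / NTB_LINK_DOWN. */
}

static const struct ntb_queue_handlers xx_handlers = {
	.rx_handler = xx_rx,
	.tx_handler = xx_tx,
	.event_handler = xx_event,
};

static int
xx_attach(device_t dev)
{
	struct xx_softc *sc = device_get_softc(dev);

	sc->qp = ntb_transport_create_queue(dev, 0, &xx_handlers, sc);
	if (sc->qp == NULL)
		return (ENXIO);
	ntb_transport_link_up(sc->qp);	/* Declare the client ready. */
	return (0);
}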
+
+/**
+ * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
+ * @qp: NTB transport layer queue to be enabled
+ *
+ * Notify NTB transport layer of client readiness to use queue
+ */
+void
+ntb_transport_link_up(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+
+ qp->client_ready = true;
+
+ ntb_printf(2, "qp %d client ready\n", qp->qp_num);
+
+ if (nt->link_is_up)
+ callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+}
+
+/* Transport Tx */
+
+/**
+ * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
+ * @qp: NTB transport layer queue the entry is to be enqueued on
+ * @cb: per buffer pointer for callback function to use
+ * @data: pointer to data buffer that will be sent
+ * @len: length of the data buffer
+ *
+ * Enqueue a new transmit buffer onto the transport queue from which a NTB
+ * payload will be transmitted. This assumes that a lock is being held to
+ * serialize access to the qp.
+ *
+ * RETURNS: An appropriate ERRNO error value on error, or zero for success.
+ */
+int
+ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len)
+{
+ struct ntb_queue_entry *entry;
+ int rc;
+
+ if (!qp->link_is_up || len == 0) {
+		CTR0(KTR_NTB, "TX: link not up or zero-length request");
+ return (EINVAL);
+ }
+
+ entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
+ if (entry == NULL) {
+ CTR0(KTR_NTB, "TX: could not get entry from tx_free_q");
+ qp->tx_err_no_buf++;
+ return (EBUSY);
+ }
+ CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry);
+
+ entry->cb_data = cb;
+ entry->buf = data;
+ entry->len = len;
+ entry->flags = 0;
+
+ mtx_lock(&qp->tx_lock);
+ rc = ntb_process_tx(qp, entry);
+ mtx_unlock(&qp->tx_lock);
+ if (rc != 0) {
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ CTR1(KTR_NTB,
+ "TX: process_tx failed. Returning entry %p to tx_free_q",
+ entry);
+ }
+ return (rc);
+}
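
And a matching transmit-side sketch (again with hypothetical xx_* names, using
the softc from the attach sketch above): the EINVAL/EBUSY handling follows
directly from ntb_transport_tx_enqueue() above, and the size guard mirrors
ntb_transport_max_size():

static int
xx_transmit(struct xx_softc *sc, struct mbuf *m)
{
	int rc;

	if (m->m_pkthdr.len > ntb_transport_max_size(sc->qp))
		return (EMSGSIZE);

	/* EINVAL: link down or empty request; EBUSY: tx_free_q exhausted. */
	rc = ntb_transport_tx_enqueue(sc->qp, sc, m, m->m_pkthdr.len);
	if (rc != 0)
		m_freem(m);	/* On success, xx_tx() frees it later. */
	return (rc);
}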
+
+static void
+ntb_tx_copy_callback(void *data)
+{
+ struct ntb_queue_entry *entry = data;
+ struct ntb_transport_qp *qp = entry->qp;
+ struct ntb_payload_header *hdr = entry->x_hdr;
+
+ iowrite32(entry->flags | NTBT_DESC_DONE_FLAG, &hdr->flags);
+ CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr);
+
+ ntb_peer_db_set(qp->dev, 1ull << qp->qp_num);
+
+ /*
+ * The entry length can only be zero if the packet is intended to be a
+ * "link down" or similar. Since no payload is being sent in these
+ * cases, there is nothing to add to the completion queue.
+ */
+ if (entry->len > 0) {
+ qp->tx_bytes += entry->len;
+
+ if (qp->tx_handler)
+ qp->tx_handler(qp, qp->cb_data, entry->buf,
+ entry->len);
+ else
+ m_freem(entry->buf);
+ entry->buf = NULL;
+ }
+
+ CTR3(KTR_NTB,
+ "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning "
+ "to tx_free_q", entry, hdr->ver, hdr->flags);
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+}
+
+static void
+ntb_memcpy_tx(struct ntb_queue_entry *entry, void *offset)
+{
+
+ CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset);
+ if (entry->buf != NULL) {
+ m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset);
+
+ /*
+ * Ensure that the data is fully copied before setting the
+ * flags
+ */
+ wmb();
+ }
+
+ ntb_tx_copy_callback(entry);
+}
+
+static void
+ntb_async_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
+{
+ struct ntb_payload_header *hdr;
+ void *offset;
+
+ offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
+ hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame -
+ sizeof(struct ntb_payload_header));
+ entry->x_hdr = hdr;
+
+ iowrite32(entry->len, &hdr->len);
+ iowrite32(qp->tx_pkts, &hdr->ver);
+
+ ntb_memcpy_tx(entry, offset);
+}
+
+static int
+ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
+{
+
+ CTR3(KTR_NTB,
+ "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u",
+ qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry);
+ if (qp->tx_index == qp->remote_rx_info->entry) {
+ CTR0(KTR_NTB, "TX: ring full");
+ qp->tx_ring_full++;
+ return (EAGAIN);
+ }
+
+ if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
+ if (qp->tx_handler != NULL)
+ qp->tx_handler(qp, qp->cb_data, entry->buf,
+ EIO);
+ else
+ m_freem(entry->buf);
+
+ entry->buf = NULL;
+ ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
+ CTR1(KTR_NTB,
+ "TX: frame too big. returning entry %p to tx_free_q",
+ entry);
+ return (0);
+ }
+ CTR2(KTR_NTB, "TX: copying entry %p to index %u", entry, qp->tx_index);
+ ntb_async_tx(qp, entry);
+
+ qp->tx_index++;
+ qp->tx_index %= qp->tx_max_entry;
+
+ qp->tx_pkts++;
+
+ return (0);
+}
+
+/* Transport Rx */
+static void
+ntb_transport_rxc_db(void *arg, int pending __unused)
+{
+ struct ntb_transport_qp *qp = arg;
+ int rc;
+
+ CTR0(KTR_NTB, "RX: transport_rx");
+again:
+ while ((rc = ntb_process_rxc(qp)) == 0)
+ ;
+ CTR1(KTR_NTB, "RX: process_rxc returned %d", rc);
+
+ if ((ntb_db_read(qp->dev) & (1ull << qp->qp_num)) != 0) {
+ /* If db is set, clear it and check queue once more. */
+ ntb_db_clear(qp->dev, 1ull << qp->qp_num);
+ goto again;
+ }
+}
+
+static int
+ntb_process_rxc(struct ntb_transport_qp *qp)
+{
+ struct ntb_payload_header *hdr;
+ struct ntb_queue_entry *entry;
+ caddr_t offset;
+
+ offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
+ hdr = (void *)(offset + qp->rx_max_frame -
+ sizeof(struct ntb_payload_header));
+
+ CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index);
+ if ((hdr->flags & NTBT_DESC_DONE_FLAG) == 0) {
+ CTR0(KTR_NTB, "RX: hdr not done");
+ qp->rx_ring_empty++;
+ return (EAGAIN);
+ }
+
+ if ((hdr->flags & NTBT_LINK_DOWN_FLAG) != 0) {
+ CTR0(KTR_NTB, "RX: link down");
+ ntb_qp_link_down(qp);
+ hdr->flags = 0;
+ return (EAGAIN);
+ }
+
+ if (hdr->ver != (uint32_t)qp->rx_pkts) {
+		CTR2(KTR_NTB, "RX: ver != rx_pkts (%x != %lx). "
+ "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts);
+ qp->rx_err_ver++;
+ return (EIO);
+ }
+
+ entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q);
+ if (entry == NULL) {
+ qp->rx_err_no_buf++;
+ CTR0(KTR_NTB, "RX: No entries in rx_pend_q");
+ return (EAGAIN);
+ }
+ callout_stop(&qp->rx_full);
+ CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry);
+
+ entry->x_hdr = hdr;
+ entry->index = qp->rx_index;
+
+ if (hdr->len > entry->len) {
+ CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju",
+ (uintmax_t)hdr->len, (uintmax_t)entry->len);
+ qp->rx_err_oflow++;
+
+ entry->len = -EIO;
+ entry->flags |= NTBT_DESC_DONE_FLAG;
+
+ ntb_complete_rxc(qp);
+ } else {
+ qp->rx_bytes += hdr->len;
+ qp->rx_pkts++;
+
+ CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts);
+
+ entry->len = hdr->len;
+
+ ntb_memcpy_rx(qp, entry, offset);
+ }
+
+ qp->rx_index++;
+ qp->rx_index %= qp->rx_max_entry;
+ return (0);
+}
+
+static void
+ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
+ void *offset)
+{
+ struct ifnet *ifp = entry->cb_data;
+ unsigned int len = entry->len;
+
+ CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset);
+
+ entry->buf = (void *)m_devget(offset, len, 0, ifp, NULL);
+ if (entry->buf == NULL)
+ entry->len = -ENOMEM;
+
+ /* Ensure that the data is globally visible before clearing the flag */
+ wmb();
+
+ CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, entry->buf);
+ ntb_rx_copy_callback(qp, entry);
+}
+
+static inline void
+ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data)
+{
+ struct ntb_queue_entry *entry;
+
+ entry = data;
+ entry->flags |= NTBT_DESC_DONE_FLAG;
+ ntb_complete_rxc(qp);
+}
+
+static void
+ntb_complete_rxc(struct ntb_transport_qp *qp)
+{
+ struct ntb_queue_entry *entry;
+ struct mbuf *m;
+ unsigned len;
+
+ CTR0(KTR_NTB, "RX: rx_completion_task");
+
+ mtx_lock_spin(&qp->ntb_rx_q_lock);
+
+ while (!STAILQ_EMPTY(&qp->rx_post_q)) {
+ entry = STAILQ_FIRST(&qp->rx_post_q);
+ if ((entry->flags & NTBT_DESC_DONE_FLAG) == 0)
+ break;
+
+ entry->x_hdr->flags = 0;
+ iowrite32(entry->index, &qp->rx_info->entry);
+
+ STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry);
+
+ len = entry->len;
+ m = entry->buf;
+
+ /*
+ * Re-initialize queue_entry for reuse; rx_handler takes
+ * ownership of the mbuf.
+ */
+ entry->buf = NULL;
+ entry->len = transport_mtu;
+ entry->cb_data = qp->cb_data;
+
+ STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry);
+
+ mtx_unlock_spin(&qp->ntb_rx_q_lock);
+
+ CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m);
+ if (qp->rx_handler != NULL && qp->client_ready)
+ qp->rx_handler(qp, qp->cb_data, m, len);
+ else
+ m_freem(m);
+
+ mtx_lock_spin(&qp->ntb_rx_q_lock);
+ }
+
+ mtx_unlock_spin(&qp->ntb_rx_q_lock);
+}
+
+static void
+ntb_transport_doorbell_callback(void *data, uint32_t vector)
+{
+ struct ntb_transport_ctx *nt = data;
+ struct ntb_transport_qp *qp;
+ uint64_t vec_mask;
+ unsigned qp_num;
+
+ vec_mask = ntb_db_vector_mask(nt->dev, vector);
+ vec_mask &= nt->qp_bitmap;
+ if ((vec_mask & (vec_mask - 1)) != 0)
+ vec_mask &= ntb_db_read(nt->dev);
+ while (vec_mask != 0) {
+ qp_num = ffsll(vec_mask) - 1;
+
+ qp = &nt->qp_vec[qp_num];
+ if (qp->link_is_up)
+ taskqueue_enqueue(qp->rxc_tq, &qp->rxc_db_work);
+
+ vec_mask &= ~(1ull << qp_num);
+ }
+}
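
The (vec_mask & (vec_mask - 1)) test above is the usual check for more than one
bit set: subtracting 1 clears the lowest set bit, so the AND is nonzero only if
another bit remains. A standalone illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t masks[] = { 0x1, 0x8, 0x6, 0x30 };
	int i;

	for (i = 0; i < 4; i++) {
		uint64_t m = masks[i];

		/* Nonzero iff more than one doorbell bit is set. */
		printf("%#jx -> %s\n", (uintmax_t)m,
		    (m & (m - 1)) != 0 ? "multiple bits" : "single bit");
	}
	return (0);
}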
+
+/* Link Event handler */
+static void
+ntb_transport_event_callback(void *data)
+{
+ struct ntb_transport_ctx *nt = data;
+
+ if (ntb_link_is_up(nt->dev, NULL, NULL)) {
+ ntb_printf(1, "HW link up\n");
+ callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
+ } else {
+ ntb_printf(1, "HW link down\n");
+ taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup);
+ }
+}
+
+/* Link bring up */
+static void
+ntb_transport_link_work(void *arg)
+{
+ struct ntb_transport_ctx *nt = arg;
+ device_t dev = nt->dev;
+ struct ntb_transport_qp *qp;
+ uint64_t val64, size;
+ uint32_t val;
+ unsigned i;
+ int rc;
+
+	/* Send the local info, in the opposite order from the way we read it. */
+ for (i = 0; i < nt->mw_count; i++) {
+ size = nt->mw_vec[i].phys_size;
+
+ if (max_mw_size != 0 && size > max_mw_size)
+ size = max_mw_size;
+
+ ntb_peer_spad_write(dev, NTBT_MW0_SZ_HIGH + (i * 2),
+ size >> 32);
+ ntb_peer_spad_write(dev, NTBT_MW0_SZ_LOW + (i * 2), size);
+ }
+ ntb_peer_spad_write(dev, NTBT_NUM_MWS, nt->mw_count);
+ ntb_peer_spad_write(dev, NTBT_NUM_QPS, nt->qp_count);
+ ntb_peer_spad_write(dev, NTBT_QP_LINKS, 0);
+ ntb_peer_spad_write(dev, NTBT_VERSION, NTB_TRANSPORT_VERSION);
+
+ /* Query the remote side for its info */
+ val = 0;
+ ntb_spad_read(dev, NTBT_VERSION, &val);
+ if (val != NTB_TRANSPORT_VERSION)
+ goto out;
+
+ ntb_spad_read(dev, NTBT_NUM_QPS, &val);
+ if (val != nt->qp_count)
+ goto out;
+
+ ntb_spad_read(dev, NTBT_NUM_MWS, &val);
+ if (val != nt->mw_count)
+ goto out;
+
+ for (i = 0; i < nt->mw_count; i++) {
+ ntb_spad_read(dev, NTBT_MW0_SZ_HIGH + (i * 2), &val);
+ val64 = (uint64_t)val << 32;
+
+ ntb_spad_read(dev, NTBT_MW0_SZ_LOW + (i * 2), &val);
+ val64 |= val;
+
+ rc = ntb_set_mw(nt, i, val64);
+ if (rc != 0)
+ goto free_mws;
+ }
+
+ nt->link_is_up = true;
+ ntb_printf(1, "transport link up\n");
+
+ for (i = 0; i < nt->qp_count; i++) {
+ qp = &nt->qp_vec[i];
+
+ ntb_transport_setup_qp_mw(nt, i);
+
+ if (qp->client_ready)
+ callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
+ }
+
+ return;
+
+free_mws:
+ for (i = 0; i < nt->mw_count; i++)
+ ntb_free_mw(nt, i);
+out:
+ if (ntb_link_is_up(dev, NULL, NULL))
+ callout_reset(&nt->link_work,
+ NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt);
+}
+
+static int
+ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size)
+{
+ struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+ size_t xlat_size, buff_size;
+ int rc;
+
+ if (size == 0)
+ return (EINVAL);
+
+ xlat_size = roundup(size, mw->xlat_align_size);
+ buff_size = xlat_size;
+
+ /* No need to re-setup */
+ if (mw->xlat_size == xlat_size)
+ return (0);
+
+ if (mw->buff_size != 0)
+ ntb_free_mw(nt, num_mw);
+
+ /* Alloc memory for receiving data. Must be aligned */
+ mw->xlat_size = xlat_size;
+ mw->buff_size = buff_size;
+
+ mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_T, M_ZERO, 0,
+ mw->addr_limit, mw->xlat_align, 0);
+ if (mw->virt_addr == NULL) {
+ ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n",
+ mw->buff_size, mw->xlat_size);
+ mw->xlat_size = 0;
+ mw->buff_size = 0;
+ return (ENOMEM);
+ }
+ /* TODO: replace with bus_space_* functions */
+ mw->dma_addr = vtophys(mw->virt_addr);
+
+ /*
+ * Ensure that the allocation from contigmalloc is aligned as
+ * requested. XXX: This may not be needed -- brought in for parity
+ * with the Linux driver.
+ */
+ if (mw->dma_addr % mw->xlat_align != 0) {
+ ntb_printf(0,
+ "DMA memory 0x%jx not aligned to BAR size 0x%zx\n",
+ (uintmax_t)mw->dma_addr, size);
+ ntb_free_mw(nt, num_mw);
+ return (ENOMEM);
+ }
+
+ /* Notify HW the memory location of the receive buffer */
+ rc = ntb_mw_set_trans(nt->dev, num_mw, mw->dma_addr, mw->xlat_size);
+ if (rc) {
+ ntb_printf(0, "Unable to set mw%d translation\n", num_mw);
+ ntb_free_mw(nt, num_mw);
+ return (rc);
+ }
+
+ return (0);
+}
+
+static void
+ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
+{
+ struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+
+ if (mw->virt_addr == NULL)
+ return;
+
+ ntb_mw_clear_trans(nt->dev, num_mw);
+ contigfree(mw->virt_addr, mw->xlat_size, M_NTB_T);
+ mw->xlat_size = 0;
+ mw->buff_size = 0;
+ mw->virt_addr = NULL;
+}
+
+static int
+ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num)
+{
+ struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+ struct ntb_transport_mw *mw;
+ void *offset;
+ ntb_q_idx_t i;
+ size_t rx_size;
+ unsigned num_qps_mw, mw_num, mw_count;
+
+ mw_count = nt->mw_count;
+ mw_num = QP_TO_MW(nt, qp_num);
+ mw = &nt->mw_vec[mw_num];
+
+ if (mw->virt_addr == NULL)
+ return (ENOMEM);
+
+ if (mw_num < nt->qp_count % mw_count)
+ num_qps_mw = nt->qp_count / mw_count + 1;
+ else
+ num_qps_mw = nt->qp_count / mw_count;
+
+ rx_size = mw->xlat_size / num_qps_mw;
+ qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
+ rx_size -= sizeof(struct ntb_rx_info);
+
+	qp->remote_rx_info = (void *)(qp->rx_buff + rx_size);
+
+	/* Due to housekeeping, there must be at least two buffers. */
+ qp->rx_max_frame = qmin(transport_mtu, rx_size / 2);
+ qp->rx_max_entry = rx_size / qp->rx_max_frame;
+ qp->rx_index = 0;
+
+ qp->remote_rx_info->entry = qp->rx_max_entry - 1;
+
+ /* Set up the hdr offsets with 0s */
+ for (i = 0; i < qp->rx_max_entry; i++) {
+ offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) -
+ sizeof(struct ntb_payload_header));
+ memset(offset, 0, sizeof(struct ntb_payload_header));
+ }
+
+ qp->rx_pkts = 0;
+ qp->tx_pkts = 0;
+ qp->tx_index = 0;
+
+ return (0);
+}
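
A worked example of the layout above (sizes illustrative, and assuming
QP_TO_MW() distributes queues round-robin as qp_num % mw_count, matching the
qp_num / mw_count offset math): with mw_count = 2, qp_count = 5 and
xlat_size = 1 MiB, window 0 hosts queues 0, 2 and 4 (num_qps_mw = 5 / 2 + 1 = 3)
while window 1 hosts queues 1 and 3 (num_qps_mw = 5 / 2 = 2). Queue 3 thus gets
rx_size = 1 MiB / 2 = 512 KiB starting at offset 512 KiB * (3 / 2) = 512 KiB
into window 1, with sizeof(struct ntb_rx_info) carved off the tail for
remote_rx_info.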
+
+static void
+ntb_qp_link_work(void *arg)
+{
+ struct ntb_transport_qp *qp = arg;
+ device_t dev = qp->dev;
+ struct ntb_transport_ctx *nt = qp->transport;
+ int i;
+ uint32_t val;
+
+ /* Report queues that are up on our side */
+ for (i = 0, val = 0; i < nt->qp_count; i++) {
+ if (nt->qp_vec[i].client_ready)
+ val |= (1 << i);
+ }
+ ntb_peer_spad_write(dev, NTBT_QP_LINKS, val);
+
+ /* See if the remote side is up */
+ ntb_spad_read(dev, NTBT_QP_LINKS, &val);
+ if ((val & (1ull << qp->qp_num)) != 0) {
+ ntb_printf(2, "qp %d link up\n", qp->qp_num);
+ qp->link_is_up = true;
+
+ if (qp->event_handler != NULL)
+ qp->event_handler(qp->cb_data, NTB_LINK_UP);
+
+ ntb_db_clear_mask(dev, 1ull << qp->qp_num);
+ } else if (nt->link_is_up)
+ callout_reset(&qp->link_work,
+ NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
+}
+
+/* Link down event */
+static void
+ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
+{
+ struct ntb_transport_qp *qp;
+ int i;
+
+ /* Pass along the info to any clients */
+ for (i = 0; i < nt->qp_count; i++) {
+		if ((nt->qp_bitmap & (1ull << i)) != 0) {
+ qp = &nt->qp_vec[i];
+ ntb_qp_link_cleanup(qp);
+ callout_drain(&qp->link_work);
+ }
+ }
+
+ if (!nt->link_is_up)
+ callout_drain(&nt->link_work);
+
+ /*
+	 * The scratchpad registers keep their values if the remote side
+	 * goes down; blast them now to give them a sane value the next
+	 * time they are accessed.
+ */
+ ntb_spad_clear(nt->dev);
+}
+
+static void
+ntb_transport_link_cleanup_work(void *arg, int pending __unused)
+{
+
+ ntb_transport_link_cleanup(arg);
+}
+
+static void
+ntb_qp_link_down(struct ntb_transport_qp *qp)
+{
+
+ ntb_qp_link_cleanup(qp);
+}
+
+static void
+ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
+{
+
+ qp->link_is_up = false;
+ ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
+
+ qp->tx_index = qp->rx_index = 0;
+ qp->tx_bytes = qp->rx_bytes = 0;
+ qp->tx_pkts = qp->rx_pkts = 0;
+
+ qp->rx_ring_empty = 0;
+ qp->tx_ring_full = 0;
+
+ qp->rx_err_no_buf = qp->tx_err_no_buf = 0;
+ qp->rx_err_oflow = qp->rx_err_ver = 0;
+}
+
+static void
+ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
+{
+
+ callout_drain(&qp->link_work);
+ ntb_qp_link_down_reset(qp);
+
+ if (qp->event_handler != NULL)
+ qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
+}
+
+/* Link commanded down */
+/**
+ * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
+ * @qp: NTB transport layer queue to be disabled
+ *
+ * Notify the NTB transport layer of the client's desire to no longer receive
+ * data on the specified transport queue. It is the client's responsibility to
+ * ensure all entries on the queue are purged or otherwise handled
+ * appropriately.
+ */
+void
+ntb_transport_link_down(struct ntb_transport_qp *qp)
+{
+ struct ntb_transport_ctx *nt = qp->transport;
+ int i;
+ uint32_t val;
+
+ qp->client_ready = false;
+ for (i = 0, val = 0; i < nt->qp_count; i++) {
+ if (nt->qp_vec[i].client_ready)
+ val |= (1 << i);
+ }
+ ntb_peer_spad_write(qp->dev, NTBT_QP_LINKS, val);
+
+ if (qp->link_is_up)
+ ntb_send_link_down(qp);
+ else
+ callout_drain(&qp->link_work);
+}
+
+/**
+ * ntb_transport_link_query - Query transport link state
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query connectivity to the remote system of the NTB transport queue
+ *
+ * RETURNS: true for link up or false for link down
+ */
+bool
+ntb_transport_link_query(struct ntb_transport_qp *qp)
+{
+
+ return (qp->link_is_up);
+}
+
+static void
+ntb_send_link_down(struct ntb_transport_qp *qp)
+{
+ struct ntb_queue_entry *entry;
+ int i, rc;
+
+ if (!qp->link_is_up)
+ return;
+
+ for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
+ entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
+ if (entry != NULL)
+ break;
+ pause("NTB Wait for link down", hz / 10);
+ }
+
+ if (entry == NULL)
+ return;
+
+ entry->cb_data = NULL;
+ entry->buf = NULL;
+ entry->len = 0;
+ entry->flags = NTBT_LINK_DOWN_FLAG;
+
+ mtx_lock(&qp->tx_lock);
+ rc = ntb_process_tx(qp, entry);
+ mtx_unlock(&qp->tx_lock);
+ if (rc != 0)
+ printf("ntb: Failed to send link down\n");
+
+ ntb_qp_link_down_reset(qp);
+}
+
+/* List Management */
+
+static void
+ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
+ struct ntb_queue_list *list)
+{
+
+ mtx_lock_spin(lock);
+ STAILQ_INSERT_TAIL(list, entry, entry);
+ mtx_unlock_spin(lock);
+}
+
+static struct ntb_queue_entry *
+ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list)
+{
+ struct ntb_queue_entry *entry;
+
+ mtx_lock_spin(lock);
+ if (STAILQ_EMPTY(list)) {
+ entry = NULL;
+ goto out;
+ }
+ entry = STAILQ_FIRST(list);
+ STAILQ_REMOVE_HEAD(list, entry);
+out:
+ mtx_unlock_spin(lock);
+
+ return (entry);
+}
+
+static struct ntb_queue_entry *
+ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from,
+ struct ntb_queue_list *to)
+{
+ struct ntb_queue_entry *entry;
+
+ mtx_lock_spin(lock);
+ if (STAILQ_EMPTY(from)) {
+ entry = NULL;
+ goto out;
+ }
+ entry = STAILQ_FIRST(from);
+ STAILQ_REMOVE_HEAD(from, entry);
+ STAILQ_INSERT_TAIL(to, entry, entry);
+
+out:
+ mtx_unlock_spin(lock);
+ return (entry);
+}
+
+/**
+ * ntb_transport_qp_num - Query the qp number
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query qp number of the NTB transport queue
+ *
+ * RETURNS: a zero based number specifying the qp number
+ */
+unsigned char
+ntb_transport_qp_num(struct ntb_transport_qp *qp)
+{
+
+ return (qp->qp_num);
+}
+
+/**
+ * ntb_transport_max_size - Query the max payload size of a qp
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query the maximum payload size permissible on the given qp
+ *
+ * RETURNS: the max payload size of a qp
+ */
+unsigned int
+ntb_transport_max_size(struct ntb_transport_qp *qp)
+{
+
+ return (qp->tx_max_frame - sizeof(struct ntb_payload_header));
+}
+
+unsigned int
+ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
+{
+ unsigned int head = qp->tx_index;
+ unsigned int tail = qp->remote_rx_info->entry;
+
+ return (tail >= head ? tail - head : qp->tx_max_entry + tail - head);
+}
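
ntb_transport_tx_free_entry() computes ring occupancy modulo tx_max_entry. For
example (values illustrative): with tx_max_entry = 8, head tx_index = 6 and
tail remote entry = 2, tail < head so the result is 8 + 2 - 6 = 4 free slots;
with head = 2 and tail = 6 it is simply 6 - 2 = 4. The ring is declared full in
ntb_process_tx() when the head catches up to the tail, so one slot is always
sacrificed to distinguish full from empty.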
+
+static device_method_t ntb_transport_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ntb_transport_probe),
+ DEVMETHOD(device_attach, ntb_transport_attach),
+ DEVMETHOD(device_detach, ntb_transport_detach),
+ DEVMETHOD_END
+};
+
+devclass_t ntb_transport_devclass;
+static DEFINE_CLASS_0(ntb_transport, ntb_transport_driver,
+ ntb_transport_methods, sizeof(struct ntb_transport_ctx));
+DRIVER_MODULE(ntb_transport, ntb_hw, ntb_transport_driver,
+ ntb_transport_devclass, NULL, NULL);
+MODULE_DEPEND(ntb_transport, ntb, 1, 1, 1);
+MODULE_VERSION(ntb_transport, 1);
diff --git a/sys/dev/ntb/ntb_transport.h b/sys/dev/ntb/ntb_transport.h
new file mode 100644
index 0000000..63cdbce
--- /dev/null
+++ b/sys/dev/ntb/ntb_transport.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+struct ntb_transport_qp;
+
+extern devclass_t ntb_transport_devclass;
+
+enum ntb_link_event {
+ NTB_LINK_DOWN = 0,
+ NTB_LINK_UP,
+};
+
+struct ntb_queue_handlers {
+ void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len);
+ void (*event_handler)(void *data, enum ntb_link_event status);
+};
+
+int ntb_transport_queue_count(device_t dev);
+struct ntb_transport_qp *
+ntb_transport_create_queue(device_t dev, int q,
+ const struct ntb_queue_handlers *handlers, void *data);
+void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len);
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+ unsigned int len);
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
+void ntb_transport_link_up(struct ntb_transport_qp *qp);
+void ntb_transport_link_down(struct ntb_transport_qp *qp);
+bool ntb_transport_link_query(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 4ceb075..d70aaad 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -1744,7 +1744,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count,
for (i = 0; i < count; i++) {
if (vectors[i] == 0)
continue;
- irq = msix->msix_vectors[vectors[i]].mv_irq;
+ irq = msix->msix_vectors[vectors[i] - 1].mv_irq;
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
@@ -1758,7 +1758,7 @@ pci_remap_msix_method(device_t dev, device_t child, int count,
printf("---");
else
printf("%d",
- msix->msix_vectors[vectors[i]].mv_irq);
+ msix->msix_vectors[vectors[i] - 1].mv_irq);
}
printf("\n");
}
diff --git a/sys/dev/sfxge/sfxge_ev.c b/sys/dev/sfxge/sfxge_ev.c
index d5aff5f..06ffed2 100644
--- a/sys/dev/sfxge/sfxge_ev.c
+++ b/sys/dev/sfxge/sfxge_ev.c
@@ -448,7 +448,7 @@ sfxge_ev_stat_update(struct sfxge_softc *sc)
goto out;
now = ticks;
- if (now - sc->ev_stats_update_time < hz)
+ if ((unsigned int)(now - sc->ev_stats_update_time) < (unsigned int)hz)
goto out;
sc->ev_stats_update_time = now;
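
The casts in this hunk and the two sfxge_port.c hunks below make the rate-limit
check robust when the ticks counter wraps from INT_MAX to INT_MIN: taken modulo
2^32, the difference between two nearby tick values stays small even across the
wrap. A standalone illustration, computing the delta in unsigned arithmetic
(which is well defined):

#include <limits.h>
#include <stdio.h>

int
main(void)
{
	int then = INT_MAX - 10;	/* just before the wrap */
	int now = INT_MIN + 20;		/* 31 ticks later, after the wrap */
	unsigned int delta;

	/* Modulo-2^32 subtraction recovers the true elapsed ticks. */
	delta = (unsigned int)now - (unsigned int)then;
	printf("delta = %u ticks\n", delta);	/* prints 31 */

	/* A plain signed compare of (now - then) would misfire here. */
	return (0);
}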
diff --git a/sys/dev/sfxge/sfxge_port.c b/sys/dev/sfxge/sfxge_port.c
index 709ed78..a4f671f 100644
--- a/sys/dev/sfxge/sfxge_port.c
+++ b/sys/dev/sfxge/sfxge_port.c
@@ -62,7 +62,7 @@ sfxge_mac_stat_update(struct sfxge_softc *sc)
}
now = ticks;
- if (now - port->mac_stats.update_time < hz) {
+ if ((unsigned int)(now - port->mac_stats.update_time) < (unsigned int)hz) {
rc = 0;
goto out;
}
@@ -543,7 +543,7 @@ sfxge_phy_stat_update(struct sfxge_softc *sc)
}
now = ticks;
- if (now - port->phy_stats.update_time < hz) {
+ if ((unsigned int)(now - port->phy_stats.update_time) < (unsigned int)hz) {
rc = 0;
goto out;
}
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 4d96840..8fa6bcd 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -728,6 +728,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
if (flags & RFPPWAIT) {
td->td_pflags |= TDP_RFPPWAIT;
td->td_rfppwait_p = p2;
+ td->td_dbgflags |= TDB_VFORK;
}
PROC_UNLOCK(p2);
if ((flags & RFSTOPPED) == 0) {
@@ -1063,7 +1064,7 @@ fork_return(struct thread *td, struct trapframe *frame)
* parent's children, do it now.
*/
dbg = p->p_pptr->p_pptr;
- proc_set_traced(p);
+ proc_set_traced(p, true);
CTR2(KTR_PTRACE,
"fork_return: attaching to new child pid %d: oppid %d",
p->p_pid, p->p_oppid);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 2c37d76..75121b5 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -2510,7 +2510,7 @@ ptracestop(struct thread *td, int sig)
* a chance to report itself upon the next iteration.
*/
if ((td->td_dbgflags & TDB_FSTP) != 0 ||
- ((p->p_flag & P2_PTRACE_FSTP) == 0 &&
+ ((p->p_flag2 & P2_PTRACE_FSTP) == 0 &&
p->p_xthread == NULL)) {
p->p_xstat = sig;
p->p_xthread = td;
diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c
index f2b83f0..201d876 100644
--- a/sys/kern/subr_syscall.c
+++ b/sys/kern/subr_syscall.c
@@ -249,5 +249,13 @@ again:
cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz);
}
PROC_UNLOCK(p2);
+
+ if (td->td_dbgflags & TDB_VFORK) {
+ PROC_LOCK(p);
+ if (p->p_ptevents & PTRACE_VFORK)
+ ptracestop(td, SIGTRAP);
+ td->td_dbgflags &= ~TDB_VFORK;
+ PROC_UNLOCK(p);
+ }
}
}
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index c4533ce..b2dbf72 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -649,12 +649,13 @@ sys_ptrace(struct thread *td, struct ptrace_args *uap)
#endif
void
-proc_set_traced(struct proc *p)
+proc_set_traced(struct proc *p, bool stop)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_TRACED;
- p->p_flag2 |= P2_PTRACE_FSTP;
+ if (stop)
+ p->p_flag2 |= P2_PTRACE_FSTP;
p->p_ptevents = PTRACE_DEFAULT;
p->p_oppid = p->p_pptr->p_pid;
}
@@ -867,7 +868,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
switch (req) {
case PT_TRACE_ME:
/* set my trace flag and "owner" so it can read/write me */
- proc_set_traced(p);
+ proc_set_traced(p, false);
if (p->p_flag & P_PPWAIT)
p->p_flag |= P_PPTRACE;
CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid);
@@ -884,7 +885,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
* The old parent is remembered so we can put things back
* on a "detach".
*/
- proc_set_traced(p);
+ proc_set_traced(p, true);
if (p->p_pptr != td->td_proc) {
proc_reparent(p, td->td_proc);
}
@@ -957,7 +958,7 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
}
tmp = *(int *)addr;
if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX |
- PTRACE_FORK | PTRACE_LWP)) != 0) {
+ PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) {
error = EINVAL;
break;
}
@@ -1296,7 +1297,11 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
if (td2->td_dbgflags & TDB_FORK) {
pl->pl_flags |= PL_FLAG_FORKED;
pl->pl_child_pid = td2->td_dbg_forked;
- }
+ if (td2->td_dbgflags & TDB_VFORK)
+ pl->pl_flags |= PL_FLAG_VFORKED;
+ } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) ==
+ TDB_VFORK)
+ pl->pl_flags |= PL_FLAG_VFORK_DONE;
if (td2->td_dbgflags & TDB_CHILD)
pl->pl_flags |= PL_FLAG_CHILD;
if (td2->td_dbgflags & TDB_BORN)
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 89b7a00..0fa87f9 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -1582,7 +1582,7 @@ static struct aiocb_ops aiocb_ops_osigevent = {
*/
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
- int type, struct aiocb_ops *ops)
+ int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
cap_rights_t rights;
@@ -2568,14 +2568,9 @@ static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
{
- struct proc *p = td->td_proc;
- struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
- ki = p->p_aioinfo;
- if (ki == NULL)
- aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 94b8149..166ed65 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -635,7 +635,6 @@ int
vop_stdfsync(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
- struct ucred *a_cred;
int a_waitfor;
struct thread *a_td;
} */ *ap;
diff --git a/sys/modules/ntb/Makefile b/sys/modules/ntb/Makefile
index a5169a0..3eaf751 100644
--- a/sys/modules/ntb/Makefile
+++ b/sys/modules/ntb/Makefile
@@ -1,5 +1,5 @@
# $FreeBSD$
-SUBDIR= ntb_hw if_ntb
+SUBDIR= ntb ntb_hw ntb_transport if_ntb
.include <bsd.subdir.mk>
diff --git a/sys/modules/ntb/ntb/Makefile b/sys/modules/ntb/ntb/Makefile
new file mode 100644
index 0000000..a343f28
--- /dev/null
+++ b/sys/modules/ntb/ntb/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../dev/ntb
+
+KMOD = ntb
+SRCS = ntb.c ntb_if.c
+SRCS += device_if.h bus_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/ntb/ntb_hw/Makefile b/sys/modules/ntb/ntb_hw/Makefile
index fc46b46..5240411 100644
--- a/sys/modules/ntb/ntb_hw/Makefile
+++ b/sys/modules/ntb/ntb_hw/Makefile
@@ -4,6 +4,8 @@
KMOD = ntb_hw
SRCS = ntb_hw.c
-SRCS += device_if.h bus_if.h pci_if.h
+SRCS += device_if.h bus_if.h pci_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/pci/pci_if.m dev/ntb/ntb_if.m
.include <bsd.kmod.mk>
diff --git a/sys/modules/ntb/ntb_transport/Makefile b/sys/modules/ntb/ntb_transport/Makefile
new file mode 100644
index 0000000..5055600
--- /dev/null
+++ b/sys/modules/ntb/ntb_transport/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../dev/ntb
+
+KMOD = ntb_transport
+SRCS = ntb_transport.c
+SRCS += device_if.h bus_if.h ntb_if.h
+
+MFILES= kern/bus_if.m kern/device_if.m dev/ntb/ntb_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 14d9967..57aadc0 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -165,7 +165,8 @@ __FBSDID("$FreeBSD$");
/*
* List of capabilities to possibly mask on the member interface.
*/
-#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM)
+#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\
+ IFCAP_TXCSUM_IPV6)
/*
* List of capabilities to strip
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index a1ceaab..8c56a13 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -2758,8 +2758,8 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
switch (af) {
#ifdef INET
case AF_INET:
- if ((a->addr32[0] < b->addr32[0]) ||
- (a->addr32[0] > e->addr32[0]))
+ if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
+ (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
return (0);
break;
#endif /* INET */
@@ -2769,15 +2769,15 @@ pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
/* check a >= b */
for (i = 0; i < 4; ++i)
- if (a->addr32[i] > b->addr32[i])
+ if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
break;
- else if (a->addr32[i] < b->addr32[i])
+ else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
return (0);
/* check a <= e */
for (i = 0; i < 4; ++i)
- if (a->addr32[i] < e->addr32[i])
+ if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
break;
- else if (a->addr32[i] > e->addr32[i])
+ else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
return (0);
break;
}
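
The pf hunk above fixes a classic pitfall: IPv4 addresses are stored in network
byte order, and comparing the raw 32-bit words on a little-endian host orders
them by the wrong byte first. ntohl() restores numeric ordering so range checks
behave. A standalone illustration:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t a = htonl(0x0a000002);	/* 10.0.0.2, network order */
	uint32_t e = htonl(0x0a000101);	/* 10.0.1.1, network order */

	/* On little-endian hosts the raw compare is wrong (prints 0). */
	printf("raw:   10.0.0.2 <= 10.0.1.1 ? %d\n", a <= e);
	/* Host-order compare gives the expected answer (prints 1). */
	printf("ntohl: 10.0.0.2 <= 10.0.1.1 ? %d\n", ntohl(a) <= ntohl(e));
	return (0);
}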
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 4b6c601..07f69c6 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1003505 /* Master, propagated to newvers */
+#define __FreeBSD_version 1003506 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 1b8bda5..59c75c5 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -398,6 +398,7 @@ do { \
#define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */
#define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */
#define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */
+#define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */
#define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */
/*
@@ -563,7 +564,7 @@ struct proc {
u_int p_magic; /* (b) Magic number. */
int p_osrel; /* (x) osreldate for the
binary (from ELF note, if any) */
- char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */
+ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */
void *p_pad0;
struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */
struct pargs *p_args; /* (c) Process arguments. */
@@ -932,7 +933,7 @@ void proc_linkup(struct proc *p, struct thread *td);
struct proc *proc_realparent(struct proc *child);
void proc_reap(struct thread *td, struct proc *p, int *status, int options);
void proc_reparent(struct proc *child, struct proc *newparent);
-void proc_set_traced(struct proc *p);
+void proc_set_traced(struct proc *p, bool stop);
struct pstats *pstats_alloc(void);
void pstats_fork(struct pstats *src, struct pstats *dst);
void pstats_free(struct pstats *ps);
diff --git a/sys/sys/ptrace.h b/sys/sys/ptrace.h
index e2b6a5f..f5f1db2 100644
--- a/sys/sys/ptrace.h
+++ b/sys/sys/ptrace.h
@@ -89,6 +89,7 @@
#define PTRACE_SYSCALL (PTRACE_SCE | PTRACE_SCX)
#define PTRACE_FORK 0x0008
#define PTRACE_LWP 0x0010
+#define PTRACE_VFORK 0x0020
#define PTRACE_DEFAULT (PTRACE_EXEC)
@@ -124,6 +125,8 @@ struct ptrace_lwpinfo {
#define PL_FLAG_CHILD 0x80 /* I am from child */
#define PL_FLAG_BORN 0x100 /* new LWP */
#define PL_FLAG_EXITED 0x200 /* exiting LWP */
+#define PL_FLAG_VFORKED 0x400 /* new child via vfork */
+#define PL_FLAG_VFORK_DONE 0x800 /* vfork parent has resumed */
sigset_t pl_sigmask; /* LWP signal mask */
sigset_t pl_siglist; /* LWP pending signal */
struct __siginfo pl_siginfo; /* siginfo for signal */
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 8551085..04e0ae9 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -255,6 +255,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -309,7 +311,7 @@ retry:
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -325,6 +327,8 @@ retry:
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
@@ -386,7 +390,7 @@ retry:
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -401,6 +405,8 @@ retry:
goto fail;
}
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -478,10 +484,16 @@ fail:
* We shall not leave the freed blocks on the vnode
* buffer object lists.
*/
- bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
if (bp != NULL) {
- bp->b_flags |= (B_INVAL | B_RELBUF);
- bp->b_flags &= ~B_ASYNC;
+ KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
+ ("mismatch1 l %jd %jd b %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
+ (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp)));
+ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
+ bp->b_flags &= ~(B_ASYNC | B_CACHE);
brelse(bp);
}
deallocated += fs->fs_bsize;
@@ -524,6 +536,18 @@ fail:
* cleared, free the blocks.
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
+#ifdef INVARIANTS
+ if (blkp == allociblk)
+ lbns_remfree = lbns;
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
+ if (bp != NULL) {
+ panic("zombie1 %jd %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp));
+ }
+ lbns_remfree++;
+#endif
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
}
@@ -818,6 +842,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
@@ -873,7 +899,7 @@ retry:
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -889,6 +915,8 @@ retry:
}
pref = newb + fs->fs_frag;
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
@@ -951,7 +979,7 @@ retry:
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
- if (++reclaimed == 1) {
+ if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
@@ -966,6 +994,8 @@ retry:
goto fail;
}
nb = newb;
+ MPASS(allocblk < allociblk + nitems(allociblk));
+ MPASS(lbns_remfree < lbns + nitems(lbns));
*allocblk++ = nb;
*lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
@@ -1049,10 +1079,16 @@ fail:
* We shall not leave the freed blocks on the vnode
* buffer object lists.
*/
- bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
if (bp != NULL) {
- bp->b_flags |= (B_INVAL | B_RELBUF);
- bp->b_flags &= ~B_ASYNC;
+ KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
+ ("mismatch2 l %jd %jd b %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
+ (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp)));
+ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
+ bp->b_flags &= ~(B_ASYNC | B_CACHE);
brelse(bp);
}
deallocated += fs->fs_bsize;
@@ -1095,6 +1131,18 @@ fail:
* cleared, free the blocks.
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
+#ifdef INVARIANTS
+ if (blkp == allociblk)
+ lbns_remfree = lbns;
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
+ GB_NOCREAT | GB_UNMAPPED);
+ if (bp != NULL) {
+ panic("zombie2 %jd %ju %ju",
+ (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
+ (uintmax_t)fsbtodb(fs, *blkp));
+ }
+ lbns_remfree++;
+#endif
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
ip->i_number, vp->v_type, NULL);
}