diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-09 08:33:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-09 08:33:31 -0700 |
commit | 26d2177e977c912863ac04f6c1a967e793ca3a56 (patch) | |
tree | 48da04fb0b947cfa404747690d7081b657e33221 /drivers/infiniband/ulp | |
parent | a794b4f3292160bb3fd0f1f90ec8df454e3b17b3 (diff) | |
parent | d1178cbcdcf91900ccf10a177350d7945703c151 (diff) | |
download | op-kernel-dev-26d2177e977c912863ac04f6c1a967e793ca3a56.zip op-kernel-dev-26d2177e977c912863ac04f6c1a967e793ca3a56.tar.gz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull inifiniband/rdma updates from Doug Ledford:
"This is a fairly sizeable set of changes. I've put them through a
decent amount of testing prior to sending the pull request due to
that.
There are still a few fixups that I know are coming, but I wanted to
go ahead and get the big, sizable chunk into your hands sooner rather
than waiting for those last few fixups.
Of note is the fact that this creates what is intended to be a
temporary area in the drivers/staging tree specifically for some
cleanups and additions that are coming for the RDMA stack. We
deprecated two drivers (ipath and amso1100) and are waiting to hear
back if we can deprecate another one (ehca). We also put Intel's new
hfi1 driver into this area because it needs to be refactored and a
transfer library created out of the factored out code, and then it and
the qib driver and the soft-roce driver should all be modified to use
that library.
I expect drivers/staging/rdma to be around for three or four kernel
releases and then to go away as all of the work is completed and final
deletions of deprecated drivers are done.
Summary of changes for 4.3:
- Create drivers/staging/rdma
- Move amso1100 driver to staging/rdma and schedule for deletion
- Move ipath driver to staging/rdma and schedule for deletion
- Add hfi1 driver to staging/rdma and set TODO for move to regular
tree
- Initial support for namespaces to be used on RDMA devices
- Add RoCE GID table handling to the RDMA core caching code
- Infrastructure to support handling of devices with differing read
and write scatter gather capabilities
- Various iSER updates
- Kill off unsafe usage of global mr registrations
- Update SRP driver
- Misc mlx4 driver updates
- Support for the mr_alloc verb
- Support for a netlink interface between kernel and user space cache
daemon to speed path record queries and route resolution
- Ininitial support for safe hot removal of verbs devices"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (136 commits)
IB/ipoib: Suppress warning for send only join failures
IB/ipoib: Clean up send-only multicast joins
IB/srp: Fix possible protection fault
IB/core: Move SM class defines from ib_mad.h to ib_smi.h
IB/core: Remove unnecessary defines from ib_mad.h
IB/hfi1: Add PSM2 user space header to header_install
IB/hfi1: Add CSRs for CONFIG_SDMA_VERBOSITY
mlx5: Fix incorrect wc pkey_index assignment for GSI messages
IB/mlx5: avoid destroying a NULL mr in reg_user_mr error flow
IB/uverbs: reject invalid or unknown opcodes
IB/cxgb4: Fix if statement in pick_local_ip6adddrs
IB/sa: Fix rdma netlink message flags
IB/ucma: HW Device hot-removal support
IB/mlx4_ib: Disassociate support
IB/uverbs: Enable device removal when there are active user space applications
IB/uverbs: Explicitly pass ib_dev to uverbs commands
IB/uverbs: Fix race between ib_uverbs_open and remove_one
IB/uverbs: Fix reference counting usage of event files
IB/core: Make ib_dealloc_pd return void
IB/srp: Create an insecure all physical rkey only if needed
...
Diffstat (limited to 'drivers/infiniband/ulp')
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib.h | 1 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 236 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 50 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 22 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iscsi_iser.c | 91 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iscsi_iser.h | 206 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iser_initiator.c | 38 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iser_memory.c | 482 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iser_verbs.c | 339 | ||||
-rw-r--r-- | drivers/infiniband/ulp/isert/ib_isert.c | 47 | ||||
-rw-r--r-- | drivers/infiniband/ulp/isert/ib_isert.h | 1 | ||||
-rw-r--r-- | drivers/infiniband/ulp/srp/ib_srp.c | 282 | ||||
-rw-r--r-- | drivers/infiniband/ulp/srp/ib_srp.h | 25 | ||||
-rw-r--r-- | drivers/infiniband/ulp/srpt/ib_srpt.c | 22 | ||||
-rw-r--r-- | drivers/infiniband/ulp/srpt/ib_srpt.h | 1 |
16 files changed, 1129 insertions, 718 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 79859c4..ca28736 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -342,7 +342,6 @@ struct ipoib_dev_priv { u16 pkey; u16 pkey_index; struct ib_pd *pd; - struct ib_mr *mr; struct ib_cq *recv_cq; struct ib_cq *send_cq; struct ib_qp *qp; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index ee39be6..c78dc16 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -332,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev, int i; for (i = 0; i < priv->cm.num_frags; ++i) - sge[i].lkey = priv->mr->lkey; + sge[i].lkey = priv->pd->local_dma_lkey; sge[0].length = IPOIB_CM_HEAD_SIZE; for (i = 1; i < priv->cm.num_frags; ++i) @@ -848,7 +848,7 @@ int ipoib_cm_dev_open(struct net_device *dev) } ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), - 0, NULL); + 0); if (ret) { printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, IPOIB_CM_IETF_ID | priv->qp->qp_num); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b2943c8..36536ce 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -48,6 +48,9 @@ #include <linux/jhash.h> #include <net/arp.h> +#include <net/addrconf.h> +#include <linux/inetdevice.h> +#include <rdma/ib_cache.h> #define DRV_VERSION "1.0.0" @@ -89,13 +92,18 @@ struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); -static void ipoib_remove_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u8 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, - .remove = ipoib_remove_one + .remove = ipoib_remove_one, + .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; int ipoib_open(struct net_device *dev) @@ -222,6 +230,225 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } +/* Called with an RCU read lock taken */ +static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct in_device *in_dev; + struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; + __be32 ret_addr; + + switch (addr->sa_family) { + case AF_INET: + in_dev = in_dev_get(dev); + if (!in_dev) + return false; + + ret_addr = inet_confirm_addr(net, in_dev, 0, + addr_in->sin_addr.s_addr, + RT_SCOPE_HOST); + in_dev_put(in_dev); + if (ret_addr) + return true; + + break; + case AF_INET6: + if (IS_ENABLED(CONFIG_IPV6) && + ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) + return true; + + break; + } + return false; +} + +/** + * Find the master net_device on top of the given net_device. + * @dev: base IPoIB net_device + * + * Returns the master net_device with a reference held, or the same net_device + * if no master exists. + */ +static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) +{ + struct net_device *master; + + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + rcu_read_unlock(); + + if (master) + return master; + + dev_hold(dev); + return dev; +} + +/** + * Find a net_device matching the given address, which is an upper device of + * the given net_device. + * @addr: IP address to look for. + * @dev: base IPoIB net_device + * + * If found, returns the net_device with a reference held. Otherwise return + * NULL. + */ +static struct net_device *ipoib_get_net_dev_match_addr( + const struct sockaddr *addr, struct net_device *dev) +{ + struct net_device *upper, + *result = NULL; + struct list_head *iter; + + rcu_read_lock(); + if (ipoib_is_dev_match_addr_rcu(addr, dev)) { + dev_hold(dev); + result = dev; + goto out; + } + + netdev_for_each_all_upper_dev_rcu(dev, upper, iter) { + if (ipoib_is_dev_match_addr_rcu(addr, upper)) { + dev_hold(upper); + result = upper; + break; + } + } +out: + rcu_read_unlock(); + return result; +} + +/* returns the number of IPoIB netdevs on top a given ipoib device matching a + * pkey_index and address, if one exists. + * + * @found_net_dev: contains a matching net_device if the return value >= 1, + * with a reference held. */ +static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, + const union ib_gid *gid, + u16 pkey_index, + const struct sockaddr *addr, + int nesting, + struct net_device **found_net_dev) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *net_dev = NULL; + int matches = 0; + + if (priv->pkey_index == pkey_index && + (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { + if (!addr) { + net_dev = ipoib_get_master_net_dev(priv->dev); + } else { + /* Verify the net_device matches the IP address, as + * IPoIB child devices currently share a GID. */ + net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); + } + if (net_dev) { + if (!*found_net_dev) + *found_net_dev = net_dev; + else + dev_put(net_dev); + ++matches; + } + } + + /* Check child interfaces */ + down_read_nested(&priv->vlan_rwsem, nesting); + list_for_each_entry(child_priv, &priv->child_intfs, list) { + matches += ipoib_match_gid_pkey_addr(child_priv, gid, + pkey_index, addr, + nesting + 1, + found_net_dev); + if (matches > 1) + break; + } + up_read(&priv->vlan_rwsem); + + return matches; +} + +/* Returns the number of matching net_devs found (between 0 and 2). Also + * return the matching net_device in the @net_dev parameter, holding a + * reference to the net_device, if the number of matches >= 1 */ +static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, + u16 pkey_index, + const union ib_gid *gid, + const struct sockaddr *addr, + struct net_device **net_dev) +{ + struct ipoib_dev_priv *priv; + int matches = 0; + + *net_dev = NULL; + + list_for_each_entry(priv, dev_list, list) { + if (priv->port != port) + continue; + + matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, + addr, 0, net_dev); + if (matches > 1) + break; + } + + return matches; +} + +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u8 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data) +{ + struct net_device *net_dev; + struct list_head *dev_list = client_data; + u16 pkey_index; + int matches; + int ret; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); + if (ret) + return NULL; + + if (!dev_list) + return NULL; + + /* See if we can find a unique device matching the L2 parameters */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, NULL, &net_dev); + + switch (matches) { + case 0: + return NULL; + case 1: + return net_dev; + } + + dev_put(net_dev); + + /* Couldn't find a unique device with L2 parameters only. Use L3 + * address to uniquely match the net device */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, addr, &net_dev); + switch (matches) { + case 0: + return NULL; + default: + dev_warn_ratelimited(&dev->dev, + "duplicate IP address detected\n"); + /* Fall through */ + case 1: + return net_dev; + } +} + int ipoib_set_mode(struct net_device *dev, const char *buf) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1715,12 +1942,11 @@ static void ipoib_add_one(struct ib_device *device) ib_set_client_data(device, &ipoib_client, dev_list); } -static void ipoib_remove_one(struct ib_device *device) +static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; - struct list_head *dev_list; + struct list_head *dev_list = client_data; - dev_list = ib_get_client_data(device, &ipoib_client); if (!dev_list) return; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 0d23e05..09a1748 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -393,8 +393,13 @@ static int ipoib_mcast_join_complete(int status, goto out_locked; } } else { - if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EAGAIN) { + bool silent_fail = + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + status == -EINVAL; + + if (mcast->logcount < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN || + silent_fail) { ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", mcast->mcmember.mgid.raw, status); @@ -403,6 +408,9 @@ static int ipoib_mcast_join_complete(int status, test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", mcast->mcmember.mgid.raw, status); } + + if (!silent_fail) + mcast->logcount++; } if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && @@ -448,8 +456,7 @@ out_locked: return status; } -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, - int create) +static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_sa_multicast *multicast; @@ -471,7 +478,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE; - if (create) { + if (mcast != priv->broadcast) { + /* + * RFC 4391: + * The MGID MUST use the same P_Key, Q_Key, SL, MTU, + * and HopLimit as those used in the broadcast-GID. The rest + * of attributes SHOULD follow the values used in the + * broadcast-GID as well. + */ comp_mask |= IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_MTU_SELECTOR | @@ -492,6 +506,22 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, rec.sl = priv->broadcast->mcmember.sl; rec.flow_label = priv->broadcast->mcmember.flow_label; rec.hop_limit = priv->broadcast->mcmember.hop_limit; + + /* + * Historically Linux IPoIB has never properly supported SEND + * ONLY join. It emulated it by not providing all the required + * attributes, which is enough to prevent group creation and + * detect if there are full members or not. A major problem + * with supporting SEND ONLY is detecting when the group is + * auto-destroyed as IPoIB will cache the MLID.. + */ +#if 1 + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + comp_mask &= ~IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; +#else + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + rec.join_state = 4; +#endif } multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, @@ -517,7 +547,6 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ib_port_attr port_attr; unsigned long delay_until = 0; struct ipoib_mcast *mcast = NULL; - int create = 1; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return; @@ -566,7 +595,6 @@ void ipoib_mcast_join_task(struct work_struct *work) if (IS_ERR_OR_NULL(priv->broadcast->mc) && !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { mcast = priv->broadcast; - create = 0; if (mcast->backoff > 1 && time_before(jiffies, mcast->delay_until)) { delay_until = mcast->delay_until; @@ -590,12 +618,8 @@ void ipoib_mcast_join_task(struct work_struct *work) /* Found the next unjoined group */ init_completion(&mcast->done); set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - create = 0; - else - create = 1; spin_unlock_irq(&priv->lock); - ipoib_mcast_join(dev, mcast, create); + ipoib_mcast_join(dev, mcast); spin_lock_irq(&priv->lock); } else if (!delay_until || time_before(mcast->delay_until, delay_until)) @@ -618,7 +642,7 @@ out: } spin_unlock_irq(&priv->lock); if (mcast) - ipoib_mcast_join(dev, mcast, create); + ipoib_mcast_join(dev, mcast); } int ipoib_mcast_start_thread(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 851c821..78845b6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -152,12 +152,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) return -ENODEV; } - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_pd; - } - /* * the various IPoIB tasks assume they will never race against * themselves, so always use a single thread workqueue @@ -165,7 +159,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->wq = create_singlethread_workqueue("ipoib_wq"); if (!priv->wq) { printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); - goto out_free_mr; + goto out_free_pd; } size = ipoib_recvq_size + 1; @@ -225,13 +219,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) - priv->tx_sge[i].lkey = priv->mr->lkey; + priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; priv->tx_wr.opcode = IB_WR_SEND; priv->tx_wr.sg_list = priv->tx_sge; priv->tx_wr.send_flags = IB_SEND_SIGNALED; - priv->rx_sge[0].lkey = priv->mr->lkey; + priv->rx_sge[0].lkey = priv->pd->local_dma_lkey; priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); priv->rx_wr.num_sge = 1; @@ -254,9 +248,6 @@ out_free_wq: destroy_workqueue(priv->wq); priv->wq = NULL; -out_free_mr: - ib_dereg_mr(priv->mr); - out_free_pd: ib_dealloc_pd(priv->pd); @@ -289,12 +280,7 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) priv->wq = NULL; } - if (ib_dereg_mr(priv->mr)) - ipoib_warn(priv, "ib_dereg_mr failed\n"); - - if (ib_dealloc_pd(priv->pd)) - ipoib_warn(priv, "ib_dealloc_pd failed\n"); - + ib_dealloc_pd(priv->pd); } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 6a594aa..1ace5d8 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -74,34 +74,37 @@ #include "iscsi_iser.h" +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); +MODULE_VERSION(DRV_VER); + static struct scsi_host_template iscsi_iser_sht; static struct iscsi_transport iscsi_iser_transport; static struct scsi_transport_template *iscsi_iser_scsi_transport; - -static unsigned int iscsi_max_lun = 512; -module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); +static struct workqueue_struct *release_wq; +struct iser_global ig; int iser_debug_level = 0; -bool iser_pi_enable = false; -int iser_pi_guard = 1; +module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); -MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); -MODULE_VERSION(DRV_VER); +static unsigned int iscsi_max_lun = 512; +module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); +MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512"); -module_param_named(debug_level, iser_debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); +unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS; +module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024"); -module_param_named(pi_enable, iser_pi_enable, bool, 0644); +bool iser_pi_enable = false; +module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO); MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); -module_param_named(pi_guard, iser_pi_guard, int, 0644); +int iser_pi_guard; +module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO); MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]"); -static struct workqueue_struct *release_wq; -struct iser_global ig; - /* * iscsi_iser_recv() - Process a successfull recv completion * @conn: iscsi connection @@ -201,10 +204,12 @@ iser_initialize_task_headers(struct iscsi_task *task, goto out; } + tx_desc->wr_idx = 0; + tx_desc->mapped = true; tx_desc->dma_addr = dma_addr; tx_desc->tx_sg[0].addr = tx_desc->dma_addr; tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; - tx_desc->tx_sg[0].lkey = device->mr->lkey; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; iser_task->iser_conn = iser_conn; out: @@ -360,16 +365,19 @@ iscsi_iser_task_xmit(struct iscsi_task *task) static void iscsi_iser_cleanup_task(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = &iser_task->desc; - struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *iser_conn = task->conn->dd_data; struct iser_device *device = iser_conn->ib_conn.device; /* DEVICE_REMOVAL event might have already released the device */ if (!device) return; - ib_dma_unmap_single(device->ib_device, - tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (likely(tx_desc->mapped)) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->mapped = false; + } /* mgmt tasks do not need special cleanup */ if (!task->sc) @@ -622,6 +630,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, if (ep) { iser_conn = ep->dd_data; max_cmds = iser_conn->max_cmds; + shost->sg_tablesize = iser_conn->scsi_sg_tablesize; + shost->max_sectors = iser_conn->scsi_max_sectors; mutex_lock(&iser_conn->state_mutex); if (iser_conn->state != ISER_CONN_UP) { @@ -640,6 +650,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, SHOST_DIX_GUARD_CRC); } + /* + * Limit the sg_tablesize and max_sectors based on the device + * max fastreg page list length. + */ + shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize, + ib_conn->device->dev_attr.max_fast_reg_page_list_len); + shost->max_sectors = min_t(unsigned int, + 1024, (shost->sg_tablesize * PAGE_SIZE) >> 9); + if (iscsi_host_add(shost, ib_conn->device->ib_device->dma_device)) { mutex_unlock(&iser_conn->state_mutex); @@ -742,15 +761,9 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; - stats->custom_length = 4; - strcpy(stats->custom[0].desc, "qp_tx_queue_full"); - stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */ - strcpy(stats->custom[1].desc, "fmr_map_not_avail"); - stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */; - strcpy(stats->custom[2].desc, "eh_abort_cnt"); - stats->custom[2].value = conn->eh_abort_cnt; - strcpy(stats->custom[3].desc, "fmr_unalign_cnt"); - stats->custom[3].value = conn->fmr_unalign_cnt; + stats->custom_length = 1; + strcpy(stats->custom[0].desc, "fmr_unalign_cnt"); + stats->custom[0].value = conn->fmr_unalign_cnt; } static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, @@ -839,10 +852,9 @@ failure: static int iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = ep->dd_data; int rc; - iser_conn = ep->dd_data; rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, msecs_to_jiffies(timeout_ms)); /* if conn establishment failed, return error code to iscsi */ @@ -854,7 +866,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) mutex_unlock(&iser_conn->state_mutex); } - iser_info("ib conn %p rc = %d\n", iser_conn, rc); + iser_info("iser conn %p rc = %d\n", iser_conn, rc); if (rc > 0) return 1; /* success, this is the equivalent of POLLOUT */ @@ -876,11 +888,9 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) static void iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = ep->dd_data; - iser_conn = ep->dd_data; - iser_info("ep %p iser conn %p state %d\n", - ep, iser_conn, iser_conn->state); + iser_info("ep %p iser conn %p\n", ep, iser_conn); mutex_lock(&iser_conn->state_mutex); iser_conn_terminate(iser_conn); @@ -900,6 +910,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) mutex_unlock(&iser_conn->state_mutex); iser_conn_release(iser_conn); } + iscsi_destroy_endpoint(ep); } @@ -962,8 +973,8 @@ static struct scsi_host_template iscsi_iser_sht = { .name = "iSCSI Initiator over iSER", .queuecommand = iscsi_queuecommand, .change_queue_depth = scsi_change_queue_depth, - .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, - .max_sectors = 1024, + .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE, + .max_sectors = ISER_DEF_MAX_SECTORS, .cmd_per_lun = ISER_DEF_CMD_PER_LUN, .eh_abort_handler = iscsi_eh_abort, .eh_device_reset_handler= iscsi_eh_device_reset, @@ -1074,7 +1085,7 @@ static void __exit iser_exit(void) if (!connlist_empty) { iser_err("Error cleanup stage completed but we still have iser " - "connections, destroying them anyway.\n"); + "connections, destroying them anyway\n"); list_for_each_entry_safe(iser_conn, n, &ig.connlist, conn_list) { iser_conn_release(iser_conn); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 262ba1f..86f6583 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -98,8 +98,13 @@ #define SHIFT_4K 12 #define SIZE_4K (1ULL << SHIFT_4K) #define MASK_4K (~(SIZE_4K-1)) - /* support up to 512KB in one RDMA */ -#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) + +/* Default support is 512KB I/O size */ +#define ISER_DEF_MAX_SECTORS 1024 +#define ISCSI_ISER_DEF_SG_TABLESIZE ((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K) +/* Maximum support is 8MB I/O size */ +#define ISCSI_ISER_MAX_SG_TABLESIZE ((16384 * 512) >> SHIFT_4K) + #define ISER_DEF_XMIT_CMDS_DEFAULT 512 #if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX @@ -239,6 +244,7 @@ struct iser_data_buf { struct iser_device; struct iscsi_iser_task; struct iscsi_endpoint; +struct iser_reg_resources; /** * struct iser_mem_reg - iSER memory registration info @@ -259,6 +265,14 @@ enum iser_desc_type { ISCSI_TX_DATAOUT }; +/* Maximum number of work requests per task: + * Data memory region local invalidate + fast registration + * Protection memory region local invalidate + fast registration + * Signature memory region local invalidate + fast registration + * PDU send + */ +#define ISER_MAX_WRS 7 + /** * struct iser_tx_desc - iSER TX descriptor (for send wr_id) * @@ -270,6 +284,12 @@ enum iser_desc_type { * sg[1] optionally points to either of immediate data * unsolicited data-out or control * @num_sge: number sges used on this TX task + * @mapped: Is the task header mapped + * @wr_idx: Current WR index + * @wrs: Array of WRs per task + * @data_reg: Data buffer registration details + * @prot_reg: Protection buffer registration details + * @sig_attrs: Signature attributes */ struct iser_tx_desc { struct iser_hdr iser_header; @@ -278,6 +298,12 @@ struct iser_tx_desc { u64 dma_addr; struct ib_sge tx_sg[2]; int num_sge; + bool mapped; + u8 wr_idx; + struct ib_send_wr wrs[ISER_MAX_WRS]; + struct iser_mem_reg data_reg; + struct iser_mem_reg prot_reg; + struct ib_sig_attrs sig_attrs; }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ @@ -324,6 +350,33 @@ struct iser_comp { }; /** + * struct iser_device - Memory registration operations + * per-device registration schemes + * + * @alloc_reg_res: Allocate registration resources + * @free_reg_res: Free registration resources + * @fast_reg_mem: Register memory buffers + * @unreg_mem: Un-register memory buffers + * @reg_desc_get: Get a registration descriptor for pool + * @reg_desc_put: Get a registration descriptor to pool + */ +struct iser_reg_ops { + int (*alloc_reg_res)(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); + void (*free_reg_res)(struct ib_conn *ib_conn); + int (*reg_mem)(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg); + void (*unreg_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn); + void (*reg_desc_put)(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); +}; + +/** * struct iser_device - iSER device handle * * @ib_device: RDMA device @@ -336,11 +389,7 @@ struct iser_comp { * @comps_used: Number of completion contexts used, Min between online * cpus and device max completion vectors * @comps: Dinamically allocated array of completion handlers - * Memory registration pool Function pointers (FMR or Fastreg): - * @iser_alloc_rdma_reg_res: Allocation of memory regions pool - * @iser_free_rdma_reg_res: Free of memory regions pool - * @iser_reg_rdma_mem: Memory registration routine - * @iser_unreg_rdma_mem: Memory deregistration routine + * @reg_ops: Registration ops */ struct iser_device { struct ib_device *ib_device; @@ -352,54 +401,73 @@ struct iser_device { int refcount; int comps_used; struct iser_comp *comps; - int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, - unsigned cmds_max); - void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); - int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); - void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); + struct iser_reg_ops *reg_ops; }; #define ISER_CHECK_GUARD 0xc0 #define ISER_CHECK_REFTAG 0x0f #define ISER_CHECK_APPTAG 0x30 -enum iser_reg_indicator { - ISER_DATA_KEY_VALID = 1 << 0, - ISER_PROT_KEY_VALID = 1 << 1, - ISER_SIG_KEY_VALID = 1 << 2, - ISER_FASTREG_PROTECTED = 1 << 3, +/** + * struct iser_reg_resources - Fast registration recources + * + * @mr: memory region + * @fmr_pool: pool of fmrs + * @frpl: fast reg page list used by frwrs + * @page_vec: fast reg page list used by fmr pool + * @mr_valid: is mr valid indicator + */ +struct iser_reg_resources { + union { + struct ib_mr *mr; + struct ib_fmr_pool *fmr_pool; + }; + union { + struct ib_fast_reg_page_list *frpl; + struct iser_page_vec *page_vec; + }; + u8 mr_valid:1; }; /** * struct iser_pi_context - Protection information context * - * @prot_mr: protection memory region - * @prot_frpl: protection fastreg page list - * @sig_mr: signature feature enabled memory region + * @rsc: protection buffer registration resources + * @sig_mr: signature enable memory region + * @sig_mr_valid: is sig_mr valid indicator + * @sig_protected: is region protected indicator */ struct iser_pi_context { - struct ib_mr *prot_mr; - struct ib_fast_reg_page_list *prot_frpl; + struct iser_reg_resources rsc; struct ib_mr *sig_mr; + u8 sig_mr_valid:1; + u8 sig_protected:1; }; /** - * struct fast_reg_descriptor - Fast registration descriptor + * struct iser_fr_desc - Fast registration descriptor * * @list: entry in connection fastreg pool - * @data_mr: data memory region - * @data_frpl: data fastreg page list + * @rsc: data buffer registration resources * @pi_ctx: protection information context - * @reg_indicators: fast registration indicators */ -struct fast_reg_descriptor { +struct iser_fr_desc { struct list_head list; - struct ib_mr *data_mr; - struct ib_fast_reg_page_list *data_frpl; + struct iser_reg_resources rsc; struct iser_pi_context *pi_ctx; - u8 reg_indicators; +}; + +/** + * struct iser_fr_pool: connection fast registration pool + * + * @list: list of fastreg descriptors + * @lock: protects fmr/fastreg pool + * @size: size of the pool + */ +struct iser_fr_pool { + struct list_head list; + spinlock_t lock; + int size; }; /** @@ -415,15 +483,7 @@ struct fast_reg_descriptor { * @pi_support: Indicate device T10-PI support * @beacon: beacon send wr to signal all flush errors were drained * @flush_comp: completes when all connection completions consumed - * @lock: protects fmr/fastreg pool - * @union.fmr: - * @pool: FMR pool for fast registrations - * @page_vec: page vector to hold mapped commands pages - * used for registration - * @union.fastreg: - * @pool: Fast registration descriptors pool for fast - * registrations - * @pool_size: Size of pool + * @fr_pool: connection fast registration poool */ struct ib_conn { struct rdma_cm_id *cma_id; @@ -436,17 +496,7 @@ struct ib_conn { bool pi_support; struct ib_send_wr beacon; struct completion flush_comp; - spinlock_t lock; - union { - struct { - struct ib_fmr_pool *pool; - struct iser_page_vec *page_vec; - } fmr; - struct { - struct list_head pool; - int pool_size; - } fastreg; - }; + struct iser_fr_pool fr_pool; }; /** @@ -477,6 +527,8 @@ struct ib_conn { * @rx_desc_head: head of rx_descs cyclic buffer * @rx_descs: rx buffers array (cyclic buffer) * @num_rx_descs: number of rx descriptors + * @scsi_sg_tablesize: scsi host sg_tablesize + * @scsi_max_sectors: scsi host max sectors */ struct iser_conn { struct ib_conn ib_conn; @@ -501,6 +553,8 @@ struct iser_conn { unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; u32 num_rx_descs; + unsigned short scsi_sg_tablesize; + unsigned int scsi_max_sectors; }; /** @@ -556,6 +610,9 @@ extern struct iser_global ig; extern int iser_debug_level; extern bool iser_pi_enable; extern int iser_pi_guard; +extern unsigned int iser_max_sectors; + +int iser_assign_reg_ops(struct iser_device *device); int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task); @@ -597,10 +654,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, enum iser_data_dir cmd_dir); -int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); -int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, - enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir); +void iser_unreg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir); int iser_connect(struct iser_conn *iser_conn, struct sockaddr *src_addr, @@ -630,15 +687,40 @@ int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, struct iscsi_session *session); -int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max); +int iser_alloc_fmr_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); void iser_free_fmr_pool(struct ib_conn *ib_conn); -int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); void iser_free_fastreg_pool(struct ib_conn *ib_conn); u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector); -struct fast_reg_descriptor * -iser_reg_desc_get(struct ib_conn *ib_conn); +struct iser_fr_desc * +iser_reg_desc_get_fr(struct ib_conn *ib_conn); void -iser_reg_desc_put(struct ib_conn *ib_conn, - struct fast_reg_descriptor *desc); +iser_reg_desc_put_fr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); +struct iser_fr_desc * +iser_reg_desc_get_fmr(struct ib_conn *ib_conn); +void +iser_reg_desc_put_fmr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); + +static inline struct ib_send_wr * +iser_tx_next_wr(struct iser_tx_desc *tx_desc) +{ + struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx]; + struct ib_send_wr *last_wr; + + if (tx_desc->wr_idx) { + last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1]; + last_wr->next = cur_wr; + } + tx_desc->wr_idx++; + + return cur_wr; +} + #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 3e2118e..d511879 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -49,7 +49,6 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -73,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) return err; } - err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); return err; @@ -103,7 +102,6 @@ iser_prepare_write_cmd(struct iscsi_task *task, unsigned int edtl) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -128,7 +126,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, return err; } - err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); if (err != 0) { iser_err("Failed to register write cmd RDMA mem\n"); return err; @@ -170,13 +168,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn, memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); tx_desc->iser_header.flags = ISER_VER; - tx_desc->num_sge = 1; - - if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { - tx_desc->tx_sg[0].lkey = device->mr->lkey; - iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc); - } } static void iser_free_login_buf(struct iser_conn *iser_conn) @@ -266,7 +258,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; - if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) + if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, + iser_conn->scsi_sg_tablesize)) goto create_rdma_reg_res_failed; if (iser_alloc_login_buf(iser_conn)) @@ -291,7 +284,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, rx_sg = &rx_desc->rx_sg; rx_sg->addr = rx_desc->dma_addr; rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = device->mr->lkey; + rx_sg->lkey = device->pd->local_dma_lkey; } iser_conn->rx_desc_head = 0; @@ -307,7 +300,7 @@ rx_desc_dma_map_failed: rx_desc_alloc_fail: iser_free_login_buf(iser_conn); alloc_login_buf_fail: - device->iser_free_rdma_reg_res(ib_conn); + device->reg_ops->free_reg_res(ib_conn); create_rdma_reg_res_failed: iser_err("failed allocating rx descriptors / data buffers\n"); return -ENOMEM; @@ -320,8 +313,8 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn) struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - if (device->iser_free_rdma_reg_res) - device->iser_free_rdma_reg_res(ib_conn); + if (device->reg_ops->free_reg_res) + device->reg_ops->free_reg_res(ib_conn); rx_desc = iser_conn->rx_descs; for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) @@ -454,7 +447,7 @@ int iser_send_data_out(struct iscsi_conn *conn, unsigned long buf_offset; unsigned long data_seg_len; uint32_t itt; - int err = 0; + int err; struct ib_sge *tx_dsg; itt = (__force uint32_t)hdr->itt; @@ -475,7 +468,9 @@ int iser_send_data_out(struct iscsi_conn *conn, memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); /* build the tx desc */ - iser_initialize_task_headers(task, tx_desc); + err = iser_initialize_task_headers(task, tx_desc); + if (err) + goto send_data_out_error; mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; tx_dsg = &tx_desc->tx_sg[1]; @@ -502,7 +497,7 @@ int iser_send_data_out(struct iscsi_conn *conn, send_data_out_error: kmem_cache_free(ig.desc_cache, tx_desc); - iser_err("conn %p failed err %d\n",conn, err); + iser_err("conn %p failed err %d\n", conn, err); return err; } @@ -543,7 +538,7 @@ int iser_send_control(struct iscsi_conn *conn, tx_dsg->addr = iser_conn->login_req_dma; tx_dsg->length = task->data_count; - tx_dsg->lkey = device->mr->lkey; + tx_dsg->lkey = device->pd->local_dma_lkey; mdesc->num_sge = 2; } @@ -666,7 +661,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { - struct iser_device *device = iser_task->iser_conn->ib_conn.device; int is_rdma_data_aligned = 1; int is_rdma_prot_aligned = 1; int prot_count = scsi_prot_sg_count(iser_task->sc); @@ -703,7 +697,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) } if (iser_task->dir[ISER_DIR_IN]) { - device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); if (is_rdma_data_aligned) iser_dma_unmap_task_data(iser_task, &iser_task->data[ISER_DIR_IN], @@ -715,7 +709,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) } if (iser_task->dir[ISER_DIR_OUT]) { - device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); if (is_rdma_data_aligned) iser_dma_unmap_task_data(iser_task, &iser_task->data[ISER_DIR_OUT], diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index f0cdc96..2493cc7 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -38,6 +38,55 @@ #include <linux/scatterlist.h> #include "iscsi_iser.h" +static +int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *mem_reg); +static +int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *mem_reg); + +static struct iser_reg_ops fastreg_ops = { + .alloc_reg_res = iser_alloc_fastreg_pool, + .free_reg_res = iser_free_fastreg_pool, + .reg_mem = iser_fast_reg_mr, + .unreg_mem = iser_unreg_mem_fastreg, + .reg_desc_get = iser_reg_desc_get_fr, + .reg_desc_put = iser_reg_desc_put_fr, +}; + +static struct iser_reg_ops fmr_ops = { + .alloc_reg_res = iser_alloc_fmr_pool, + .free_reg_res = iser_free_fmr_pool, + .reg_mem = iser_fast_reg_fmr, + .unreg_mem = iser_unreg_mem_fmr, + .reg_desc_get = iser_reg_desc_get_fmr, + .reg_desc_put = iser_reg_desc_put_fmr, +}; + +int iser_assign_reg_ops(struct iser_device *device) +{ + struct ib_device_attr *dev_attr = &device->dev_attr; + + /* Assign function handles - based on FMR support */ + if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && + device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + iser_info("FMR supported, using FMR for registration\n"); + device->reg_ops = &fmr_ops; + } else + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + iser_info("FastReg supported, using FastReg for registration\n"); + device->reg_ops = &fastreg_ops; + } else { + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; + } + + return 0; +} static void iser_free_bounce_sg(struct iser_data_buf *data) @@ -146,30 +195,47 @@ iser_copy_to_bounce(struct iser_data_buf *data) iser_copy_bounce(data, true); } -struct fast_reg_descriptor * -iser_reg_desc_get(struct ib_conn *ib_conn) +struct iser_fr_desc * +iser_reg_desc_get_fr(struct ib_conn *ib_conn) { - struct fast_reg_descriptor *desc; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; unsigned long flags; - spin_lock_irqsave(&ib_conn->lock, flags); - desc = list_first_entry(&ib_conn->fastreg.pool, - struct fast_reg_descriptor, list); + spin_lock_irqsave(&fr_pool->lock, flags); + desc = list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); list_del(&desc->list); - spin_unlock_irqrestore(&ib_conn->lock, flags); + spin_unlock_irqrestore(&fr_pool->lock, flags); return desc; } void -iser_reg_desc_put(struct ib_conn *ib_conn, - struct fast_reg_descriptor *desc) +iser_reg_desc_put_fr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc) { + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; unsigned long flags; - spin_lock_irqsave(&ib_conn->lock, flags); - list_add(&desc->list, &ib_conn->fastreg.pool); - spin_unlock_irqrestore(&ib_conn->lock, flags); + spin_lock_irqsave(&fr_pool->lock, flags); + list_add(&desc->list, &fr_pool->list); + spin_unlock_irqrestore(&fr_pool->lock, flags); +} + +struct iser_fr_desc * +iser_reg_desc_get_fmr(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + + return list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); +} + +void +iser_reg_desc_put_fmr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc) +{ } /** @@ -297,7 +363,8 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, * consecutive SG elements are actually fragments of the same physcial page. */ static int iser_data_buf_aligned_len(struct iser_data_buf *data, - struct ib_device *ibdev) + struct ib_device *ibdev, + unsigned sg_tablesize) { struct scatterlist *sg, *sgl, *next_sg = NULL; u64 start_addr, end_addr; @@ -309,6 +376,14 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data, sgl = data->sg; start_addr = ib_sg_dma_address(ibdev, sgl); + if (unlikely(sgl[0].offset && + data->data_len >= sg_tablesize * PAGE_SIZE)) { + iser_dbg("can't register length %lx with offset %x " + "fall to bounce buffer\n", data->data_len, + sgl[0].offset); + return 0; + } + for_each_sg(sgl, sg, data->dma_nents, i) { if (start_check && !IS_4K_ALIGNED(start_addr)) break; @@ -330,8 +405,11 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data, break; } ret_len = (next_sg) ? i : i+1; - iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", - ret_len, data->dma_nents, data); + + if (unlikely(ret_len != data->dma_nents)) + iser_warn("rdma alignment violation (%d/%d aligned)\n", + ret_len, data->dma_nents); + return ret_len; } @@ -393,7 +471,7 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, { struct scatterlist *sg = mem->sg; - reg->sge.lkey = device->mr->lkey; + reg->sge.lkey = device->pd->local_dma_lkey; reg->rkey = device->mr->rkey; reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); @@ -407,15 +485,12 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, - enum iser_data_dir cmd_dir, - int aligned_len) + enum iser_data_dir cmd_dir) { struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; struct iser_device *device = iser_task->iser_conn->ib_conn.device; iscsi_conn->fmr_unalign_cnt++; - iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", - aligned_len, mem->size); if (iser_debug_level > 0) iser_data_buf_dump(mem, device->ib_device); @@ -439,13 +514,15 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, * returns: 0 on success, errno code on failure */ static -int iser_reg_page_vec(struct iscsi_iser_task *iser_task, +int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, - struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg) + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg) { struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; + struct iser_page_vec *page_vec = rsc->page_vec; + struct ib_fmr_pool *fmr_pool = rsc->fmr_pool; struct ib_pool_fmr *fmr; int ret, plen; @@ -461,7 +538,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task, return -EINVAL; } - fmr = ib_fmr_pool_map_phys(ib_conn->fmr.pool, + fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages, page_vec->length, page_vec->pages[0]); @@ -471,11 +548,15 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task, return ret; } - mem_reg->sge.lkey = fmr->fmr->lkey; - mem_reg->rkey = fmr->fmr->rkey; - mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset; - mem_reg->sge.length = page_vec->data_size; - mem_reg->mem_h = fmr; + reg->sge.lkey = fmr->fmr->lkey; + reg->rkey = fmr->fmr->rkey; + reg->sge.addr = page_vec->pages[0] + page_vec->offset; + reg->sge.length = page_vec->data_size; + reg->mem_h = fmr; + + iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); return 0; } @@ -505,71 +586,17 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir) { + struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; if (!reg->mem_h) return; - iser_reg_desc_put(&iser_task->iser_conn->ib_conn, - reg->mem_h); + device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn, + reg->mem_h); reg->mem_h = NULL; } -/** - * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, - * using FMR (if possible) obtaining rkey and va - * - * returns 0 on success, errno code on failure - */ -int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; - struct iser_device *device = ib_conn->device; - struct ib_device *ibdev = device->ib_device; - struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_mem_reg *mem_reg; - int aligned_len; - int err; - int i; - - mem_reg = &iser_task->rdma_reg[cmd_dir]; - - aligned_len = iser_data_buf_aligned_len(mem, ibdev); - if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, mem, - cmd_dir, aligned_len); - if (err) { - iser_err("failed to allocate bounce buffer\n"); - return err; - } - } - - /* if there a single dma entry, FMR is not needed */ - if (mem->dma_nents == 1) { - return iser_reg_dma(device, mem, mem_reg); - } else { /* use FMR for multiple dma entries */ - err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec, - mem_reg); - if (err && err != -EAGAIN) { - iser_data_buf_dump(mem, ibdev); - iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", - mem->dma_nents, - ntoh24(iser_task->desc.iscsi_header.dlength)); - iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", - ib_conn->fmr.page_vec->data_size, - ib_conn->fmr.page_vec->length, - ib_conn->fmr.page_vec->offset); - for (i = 0; i < ib_conn->fmr.page_vec->length; i++) - iser_err("page_vec[%d] = 0x%llx\n", i, - (unsigned long long)ib_conn->fmr.page_vec->pages[i]); - } - if (err) - return err; - } - return 0; -} - static void iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, struct ib_sig_domain *domain) @@ -637,10 +664,11 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) { u32 rkey; - memset(inv_wr, 0, sizeof(*inv_wr)); inv_wr->opcode = IB_WR_LOCAL_INV; inv_wr->wr_id = ISER_FASTREG_LI_WRID; inv_wr->ex.invalidate_rkey = mr->rkey; + inv_wr->send_flags = 0; + inv_wr->num_sge = 0; rkey = ib_inc_rkey(mr->rkey); ib_update_fast_reg_key(mr, rkey); @@ -648,61 +676,51 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task, - struct fast_reg_descriptor *desc, + struct iser_pi_context *pi_ctx, struct iser_mem_reg *data_reg, struct iser_mem_reg *prot_reg, struct iser_mem_reg *sig_reg) { - struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; - struct iser_pi_context *pi_ctx = desc->pi_ctx; - struct ib_send_wr sig_wr, inv_wr; - struct ib_send_wr *bad_wr, *wr = NULL; - struct ib_sig_attrs sig_attrs; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; + struct ib_send_wr *wr; int ret; - memset(&sig_attrs, 0, sizeof(sig_attrs)); - ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); + memset(sig_attrs, 0, sizeof(*sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, sig_attrs); if (ret) goto err; - iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); + iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); - if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { - iser_inv_rkey(&inv_wr, pi_ctx->sig_mr); - wr = &inv_wr; + if (!pi_ctx->sig_mr_valid) { + wr = iser_tx_next_wr(tx_desc); + iser_inv_rkey(wr, pi_ctx->sig_mr); } - memset(&sig_wr, 0, sizeof(sig_wr)); - sig_wr.opcode = IB_WR_REG_SIG_MR; - sig_wr.wr_id = ISER_FASTREG_LI_WRID; - sig_wr.sg_list = &data_reg->sge; - sig_wr.num_sge = 1; - sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; - sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + wr = iser_tx_next_wr(tx_desc); + wr->opcode = IB_WR_REG_SIG_MR; + wr->wr_id = ISER_FASTREG_LI_WRID; + wr->sg_list = &data_reg->sge; + wr->num_sge = 1; + wr->send_flags = 0; + wr->wr.sig_handover.sig_attrs = sig_attrs; + wr->wr.sig_handover.sig_mr = pi_ctx->sig_mr; if (scsi_prot_sg_count(iser_task->sc)) - sig_wr.wr.sig_handover.prot = &prot_reg->sge; - sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - - if (!wr) - wr = &sig_wr; + wr->wr.sig_handover.prot = &prot_reg->sge; else - wr->next = &sig_wr; - - ret = ib_post_send(ib_conn->qp, wr, &bad_wr); - if (ret) { - iser_err("reg_sig_mr failed, ret:%d\n", ret); - goto err; - } - desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + wr->wr.sig_handover.prot = NULL; + wr->wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + pi_ctx->sig_mr_valid = 0; sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; sig_reg->rkey = pi_ctx->sig_mr->rkey; sig_reg->sge.addr = 0; sig_reg->sge.length = scsi_transfer_length(iser_task->sc); - iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n", + iser_dbg("sig reg: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n", sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, sig_reg->sge.length); err: @@ -711,29 +729,16 @@ err: static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, - struct fast_reg_descriptor *desc, - enum iser_reg_indicator ind, + struct iser_reg_resources *rsc, struct iser_mem_reg *reg) { struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - struct ib_mr *mr; - struct ib_fast_reg_page_list *frpl; - struct ib_send_wr fastreg_wr, inv_wr; - struct ib_send_wr *bad_wr, *wr = NULL; - int ret, offset, size, plen; - - /* if there a single dma entry, dma mr suffices */ - if (mem->dma_nents == 1) - return iser_reg_dma(device, mem, reg); - - if (ind == ISER_DATA_KEY_VALID) { - mr = desc->data_mr; - frpl = desc->data_frpl; - } else { - mr = desc->pi_ctx->prot_mr; - frpl = desc->pi_ctx->prot_frpl; - } + struct ib_mr *mr = rsc->mr; + struct ib_fast_reg_page_list *frpl = rsc->frpl; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_send_wr *wr; + int offset, size, plen; plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, &offset, &size); @@ -742,118 +747,151 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, return -EINVAL; } - if (!(desc->reg_indicators & ind)) { - iser_inv_rkey(&inv_wr, mr); - wr = &inv_wr; + if (!rsc->mr_valid) { + wr = iser_tx_next_wr(tx_desc); + iser_inv_rkey(wr, mr); } - /* Prepare FASTREG WR */ - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; - fastreg_wr.wr.fast_reg.page_list = frpl; - fastreg_wr.wr.fast_reg.page_list_len = plen; - fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; - fastreg_wr.wr.fast_reg.length = size; - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - if (!wr) - wr = &fastreg_wr; - else - wr->next = &fastreg_wr; - - ret = ib_post_send(ib_conn->qp, wr, &bad_wr); - if (ret) { - iser_err("fast registration failed, ret:%d\n", ret); - return ret; - } - desc->reg_indicators &= ~ind; + wr = iser_tx_next_wr(tx_desc); + wr->opcode = IB_WR_FAST_REG_MR; + wr->wr_id = ISER_FASTREG_LI_WRID; + wr->send_flags = 0; + wr->wr.fast_reg.iova_start = frpl->page_list[0] + offset; + wr->wr.fast_reg.page_list = frpl; + wr->wr.fast_reg.page_list_len = plen; + wr->wr.fast_reg.page_shift = SHIFT_4K; + wr->wr.fast_reg.length = size; + wr->wr.fast_reg.rkey = mr->rkey; + wr->wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + rsc->mr_valid = 0; reg->sge.lkey = mr->lkey; reg->rkey = mr->rkey; reg->sge.addr = frpl->page_list[0] + offset; reg->sge.length = size; - return ret; + iser_dbg("fast reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; } -/** - * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, - * using Fast Registration WR (if possible) obtaining rkey and va - * - * returns 0 on success, errno code on failure - */ -int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) +static int +iser_handle_unaligned_buf(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + enum iser_data_dir dir) { - struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; - struct iser_device *device = ib_conn->device; - struct ib_device *ibdev = device->ib_device; - struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir]; - struct fast_reg_descriptor *desc = NULL; + struct iser_conn *iser_conn = task->iser_conn; + struct iser_device *device = iser_conn->ib_conn.device; int err, aligned_len; - aligned_len = iser_data_buf_aligned_len(mem, ibdev); + aligned_len = iser_data_buf_aligned_len(mem, device->ib_device, + iser_conn->scsi_sg_tablesize); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, mem, - cmd_dir, aligned_len); - if (err) { - iser_err("failed to allocate bounce buffer\n"); + err = fall_to_bounce_buf(task, mem, dir); + if (err) return err; - } } + return 0; +} + +static int +iser_reg_prot_sg(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + struct iser_fr_desc *desc, + struct iser_mem_reg *reg) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + if (mem->dma_nents == 1) + return iser_reg_dma(device, mem, reg); + + return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg); +} + +static int +iser_reg_data_sg(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + struct iser_fr_desc *desc, + struct iser_mem_reg *reg) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + if (mem->dma_nents == 1) + return iser_reg_dma(device, mem, reg); + + return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg); +} + +int iser_reg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir) +{ + struct ib_conn *ib_conn = &task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct iser_data_buf *mem = &task->data[dir]; + struct iser_mem_reg *reg = &task->rdma_reg[dir]; + struct iser_mem_reg *data_reg; + struct iser_fr_desc *desc = NULL; + int err; + + err = iser_handle_unaligned_buf(task, mem, dir); + if (unlikely(err)) + return err; + if (mem->dma_nents != 1 || - scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - desc = iser_reg_desc_get(ib_conn); - mem_reg->mem_h = desc; + scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) { + desc = device->reg_ops->reg_desc_get(ib_conn); + reg->mem_h = desc; } - err = iser_fast_reg_mr(iser_task, mem, desc, - ISER_DATA_KEY_VALID, mem_reg); - if (err) + if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) + data_reg = reg; + else + data_reg = &task->desc.data_reg; + + err = iser_reg_data_sg(task, mem, desc, data_reg); + if (unlikely(err)) goto err_reg; - if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - struct iser_mem_reg prot_reg; - - memset(&prot_reg, 0, sizeof(prot_reg)); - if (scsi_prot_sg_count(iser_task->sc)) { - mem = &iser_task->prot[cmd_dir]; - aligned_len = iser_data_buf_aligned_len(mem, ibdev); - if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, mem, - cmd_dir, aligned_len); - if (err) { - iser_err("failed to allocate bounce buffer\n"); - return err; - } - } + if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) { + struct iser_mem_reg *prot_reg = &task->desc.prot_reg; - err = iser_fast_reg_mr(iser_task, mem, desc, - ISER_PROT_KEY_VALID, &prot_reg); - if (err) + if (scsi_prot_sg_count(task->sc)) { + mem = &task->prot[dir]; + err = iser_handle_unaligned_buf(task, mem, dir); + if (unlikely(err)) goto err_reg; - } - err = iser_reg_sig_mr(iser_task, desc, mem_reg, - &prot_reg, mem_reg); - if (err) { - iser_err("Failed to register signature mr\n"); - return err; + err = iser_reg_prot_sg(task, mem, desc, prot_reg); + if (unlikely(err)) + goto err_reg; } - desc->reg_indicators |= ISER_FASTREG_PROTECTED; + + err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg, + prot_reg, reg); + if (unlikely(err)) + goto err_reg; + + desc->pi_ctx->sig_protected = 1; } return 0; + err_reg: if (desc) - iser_reg_desc_put(ib_conn, desc); + device->reg_ops->reg_desc_put(ib_conn, desc); return err; } + +void iser_unreg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + device->reg_ops->unreg_mem(task, dir); +} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 5c9f565..ae70cc1 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -87,25 +87,9 @@ static int iser_create_device_ib_res(struct iser_device *device) return ret; } - /* Assign function handles - based on FMR support */ - if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && - device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { - iser_info("FMR supported, using FMR for registration\n"); - device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; - device->iser_free_rdma_reg_res = iser_free_fmr_pool; - device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; - device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; - } else - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - iser_info("FastReg supported, using FastReg for registration\n"); - device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; - device->iser_free_rdma_reg_res = iser_free_fastreg_pool; - device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; - device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; - } else { - iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); - return -1; - } + ret = iser_assign_reg_ops(device); + if (ret) + return ret; device->comps_used = min_t(int, num_online_cpus(), device->ib_device->num_comp_vectors); @@ -201,7 +185,7 @@ static void iser_free_device_ib_res(struct iser_device *device) (void)ib_unregister_event_handler(&device->event_handler); (void)ib_dereg_mr(device->mr); - (void)ib_dealloc_pd(device->pd); + ib_dealloc_pd(device->pd); kfree(device->comps); device->comps = NULL; @@ -211,28 +195,40 @@ static void iser_free_device_ib_res(struct iser_device *device) } /** - * iser_create_fmr_pool - Creates FMR pool and page_vector + * iser_alloc_fmr_pool - Creates FMR pool and page_vector * * returns 0 on success, or errno code on failure */ -int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) +int iser_alloc_fmr_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size) { struct iser_device *device = ib_conn->device; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_page_vec *page_vec; + struct iser_fr_desc *desc; + struct ib_fmr_pool *fmr_pool; struct ib_fmr_pool_param params; - int ret = -ENOMEM; + int ret; - ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + - (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), - GFP_KERNEL); - if (!ib_conn->fmr.page_vec) - return ret; + INIT_LIST_HEAD(&fr_pool->list); + spin_lock_init(&fr_pool->lock); + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; - ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); + page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size), + GFP_KERNEL); + if (!page_vec) { + ret = -ENOMEM; + goto err_frpl; + } + + page_vec->pages = (u64 *)(page_vec + 1); params.page_shift = SHIFT_4K; - /* when the first/last SG element are not start/end * - * page aligned, the map whould be of N+1 pages */ - params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; + params.max_pages_per_fmr = size; /* make the pool size twice the max number of SCSI commands * * the ML is expected to queue, watermark for unmap at 50% */ params.pool_size = cmds_max * 2; @@ -243,23 +239,25 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ); - ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); - if (!IS_ERR(ib_conn->fmr.pool)) - return 0; - - /* no FMR => no need for page_vec */ - kfree(ib_conn->fmr.page_vec); - ib_conn->fmr.page_vec = NULL; - - ret = PTR_ERR(ib_conn->fmr.pool); - ib_conn->fmr.pool = NULL; - if (ret != -ENOSYS) { + fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); iser_err("FMR allocation failed, err %d\n", ret); - return ret; - } else { - iser_warn("FMRs are not supported, using unaligned mode\n"); - return 0; + goto err_fmr; } + + desc->rsc.page_vec = page_vec; + desc->rsc.fmr_pool = fmr_pool; + list_add(&desc->list, &fr_pool->list); + + return 0; + +err_fmr: + kfree(page_vec); +err_frpl: + kfree(desc); + + return ret; } /** @@ -267,26 +265,68 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) */ void iser_free_fmr_pool(struct ib_conn *ib_conn) { + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + + desc = list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); + list_del(&desc->list); + iser_info("freeing conn %p fmr pool %p\n", - ib_conn, ib_conn->fmr.pool); + ib_conn, desc->rsc.fmr_pool); + + ib_destroy_fmr_pool(desc->rsc.fmr_pool); + kfree(desc->rsc.page_vec); + kfree(desc); +} + +static int +iser_alloc_reg_res(struct ib_device *ib_device, + struct ib_pd *pd, + struct iser_reg_resources *res, + unsigned int size) +{ + int ret; + + res->frpl = ib_alloc_fast_reg_page_list(ib_device, size); + if (IS_ERR(res->frpl)) { + ret = PTR_ERR(res->frpl); + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", + ret); + return PTR_ERR(res->frpl); + } + + res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size); + if (IS_ERR(res->mr)) { + ret = PTR_ERR(res->mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto fast_reg_mr_failure; + } + res->mr_valid = 1; - if (ib_conn->fmr.pool != NULL) - ib_destroy_fmr_pool(ib_conn->fmr.pool); + return 0; + +fast_reg_mr_failure: + ib_free_fast_reg_page_list(res->frpl); - ib_conn->fmr.pool = NULL; + return ret; +} - kfree(ib_conn->fmr.page_vec); - ib_conn->fmr.page_vec = NULL; +static void +iser_free_reg_res(struct iser_reg_resources *rsc) +{ + ib_dereg_mr(rsc->mr); + ib_free_fast_reg_page_list(rsc->frpl); } static int -iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd, - struct fast_reg_descriptor *desc) +iser_alloc_pi_ctx(struct ib_device *ib_device, + struct ib_pd *pd, + struct iser_fr_desc *desc, + unsigned int size) { struct iser_pi_context *pi_ctx = NULL; - struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2, - .flags = IB_MR_SIGNATURE_EN}; - int ret = 0; + int ret; desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); if (!desc->pi_ctx) @@ -294,36 +334,25 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd, pi_ctx = desc->pi_ctx; - pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_frpl)) { - ret = PTR_ERR(pi_ctx->prot_frpl); - goto prot_frpl_failure; - } - - pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(pi_ctx->prot_mr)) { - ret = PTR_ERR(pi_ctx->prot_mr); - goto prot_mr_failure; + ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size); + if (ret) { + iser_err("failed to allocate reg_resources\n"); + goto alloc_reg_res_err; } - desc->reg_indicators |= ISER_PROT_KEY_VALID; - pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); if (IS_ERR(pi_ctx->sig_mr)) { ret = PTR_ERR(pi_ctx->sig_mr); goto sig_mr_failure; } - desc->reg_indicators |= ISER_SIG_KEY_VALID; - desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + pi_ctx->sig_mr_valid = 1; + desc->pi_ctx->sig_protected = 0; return 0; sig_mr_failure: - ib_dereg_mr(desc->pi_ctx->prot_mr); -prot_mr_failure: - ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); -prot_frpl_failure: + iser_free_reg_res(&pi_ctx->rsc); +alloc_reg_res_err: kfree(desc->pi_ctx); return ret; @@ -332,82 +361,71 @@ prot_frpl_failure: static void iser_free_pi_ctx(struct iser_pi_context *pi_ctx) { - ib_free_fast_reg_page_list(pi_ctx->prot_frpl); - ib_dereg_mr(pi_ctx->prot_mr); - ib_destroy_mr(pi_ctx->sig_mr); + iser_free_reg_res(&pi_ctx->rsc); + ib_dereg_mr(pi_ctx->sig_mr); kfree(pi_ctx); } -static int -iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, - bool pi_enable, struct fast_reg_descriptor *desc) +static struct iser_fr_desc * +iser_create_fastreg_desc(struct ib_device *ib_device, + struct ib_pd *pd, + bool pi_enable, + unsigned int size) { + struct iser_fr_desc *desc; int ret; - desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(desc->data_frpl)) { - ret = PTR_ERR(desc->data_frpl); - iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", - ret); - return PTR_ERR(desc->data_frpl); - } + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); - desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(desc->data_mr)) { - ret = PTR_ERR(desc->data_mr); - iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); - goto fast_reg_mr_failure; - } - desc->reg_indicators |= ISER_DATA_KEY_VALID; + ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size); + if (ret) + goto reg_res_alloc_failure; if (pi_enable) { - ret = iser_alloc_pi_ctx(ib_device, pd, desc); + ret = iser_alloc_pi_ctx(ib_device, pd, desc, size); if (ret) goto pi_ctx_alloc_failure; } - return 0; + return desc; + pi_ctx_alloc_failure: - ib_dereg_mr(desc->data_mr); -fast_reg_mr_failure: - ib_free_fast_reg_page_list(desc->data_frpl); + iser_free_reg_res(&desc->rsc); +reg_res_alloc_failure: + kfree(desc); - return ret; + return ERR_PTR(ret); } /** - * iser_create_fastreg_pool - Creates pool of fast_reg descriptors + * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors * for fast registration work requests. * returns 0 on success, or errno code on failure */ -int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size) { struct iser_device *device = ib_conn->device; - struct fast_reg_descriptor *desc; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; int i, ret; - INIT_LIST_HEAD(&ib_conn->fastreg.pool); - ib_conn->fastreg.pool_size = 0; + INIT_LIST_HEAD(&fr_pool->list); + spin_lock_init(&fr_pool->lock); + fr_pool->size = 0; for (i = 0; i < cmds_max; i++) { - desc = kzalloc(sizeof(*desc), GFP_KERNEL); - if (!desc) { - iser_err("Failed to allocate a new fast_reg descriptor\n"); - ret = -ENOMEM; - goto err; - } - - ret = iser_create_fastreg_desc(device->ib_device, device->pd, - ib_conn->pi_support, desc); - if (ret) { - iser_err("Failed to create fastreg descriptor err=%d\n", - ret); - kfree(desc); + desc = iser_create_fastreg_desc(device->ib_device, device->pd, + ib_conn->pi_support, size); + if (IS_ERR(desc)) { + ret = PTR_ERR(desc); goto err; } - list_add_tail(&desc->list, &ib_conn->fastreg.pool); - ib_conn->fastreg.pool_size++; + list_add_tail(&desc->list, &fr_pool->list); + fr_pool->size++; } return 0; @@ -422,27 +440,27 @@ err: */ void iser_free_fastreg_pool(struct ib_conn *ib_conn) { - struct fast_reg_descriptor *desc, *tmp; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc, *tmp; int i = 0; - if (list_empty(&ib_conn->fastreg.pool)) + if (list_empty(&fr_pool->list)) return; iser_info("freeing conn %p fr pool\n", ib_conn); - list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { + list_for_each_entry_safe(desc, tmp, &fr_pool->list, list) { list_del(&desc->list); - ib_free_fast_reg_page_list(desc->data_frpl); - ib_dereg_mr(desc->data_mr); + iser_free_reg_res(&desc->rsc); if (desc->pi_ctx) iser_free_pi_ctx(desc->pi_ctx); kfree(desc); ++i; } - if (i < ib_conn->fastreg.pool_size) + if (i < fr_pool->size) iser_warn("pool still has %d regions registered\n", - ib_conn->fastreg.pool_size - i); + fr_pool->size - i); } /** @@ -738,6 +756,31 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) iser_conn->state = ISER_CONN_TERMINATING; } +static void +iser_calc_scsi_params(struct iser_conn *iser_conn, + unsigned int max_sectors) +{ + struct iser_device *device = iser_conn->ib_conn.device; + unsigned short sg_tablesize, sup_sg_tablesize; + + sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); + sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE, + device->dev_attr.max_fast_reg_page_list_len); + + if (sg_tablesize > sup_sg_tablesize) { + sg_tablesize = sup_sg_tablesize; + iser_conn->scsi_max_sectors = sg_tablesize * SIZE_4K / 512; + } else { + iser_conn->scsi_max_sectors = max_sectors; + } + + iser_conn->scsi_sg_tablesize = sg_tablesize; + + iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n", + iser_conn, iser_conn->scsi_sg_tablesize, + iser_conn->scsi_max_sectors); +} + /** * Called with state mutex held **/ @@ -776,6 +819,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) } } + iser_calc_scsi_params(iser_conn, iser_max_sectors); + ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); @@ -938,7 +983,6 @@ void iser_conn_init(struct iser_conn *iser_conn) init_completion(&iser_conn->ib_completion); init_completion(&iser_conn->up_completion); INIT_LIST_HEAD(&iser_conn->conn_list); - spin_lock_init(&iser_conn->ib_conn.lock); mutex_init(&iser_conn->state_mutex); } @@ -1017,7 +1061,7 @@ int iser_post_recvl(struct iser_conn *iser_conn) sge.addr = iser_conn->login_resp_dma; sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = ib_conn->device->mr->lkey; + sge.lkey = ib_conn->device->pd->local_dma_lkey; rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; rx_wr.sg_list = &sge; @@ -1072,23 +1116,24 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, bool signal) { - int ib_ret; - struct ib_send_wr send_wr, *send_wr_failed; + struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc); + int ib_ret; ib_dma_sync_single_for_device(ib_conn->device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); - send_wr.next = NULL; - send_wr.wr_id = (uintptr_t)tx_desc; - send_wr.sg_list = tx_desc->tx_sg; - send_wr.num_sge = tx_desc->num_sge; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0; + wr->next = NULL; + wr->wr_id = (uintptr_t)tx_desc; + wr->sg_list = tx_desc->tx_sg; + wr->num_sge = tx_desc->num_sge; + wr->opcode = IB_WR_SEND; + wr->send_flags = signal ? IB_SEND_SIGNALED : 0; - ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); + ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0], &bad_wr); if (ib_ret) - iser_err("ib_post_send failed, ret:%d\n", ib_ret); + iser_err("ib_post_send failed, ret:%d opcode:%d\n", + ib_ret, bad_wr->opcode); return ib_ret; } @@ -1240,13 +1285,13 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; - struct fast_reg_descriptor *desc = reg->mem_h; + struct iser_fr_desc *desc = reg->mem_h; unsigned long sector_size = iser_task->sc->device->sector_size; struct ib_mr_status mr_status; int ret; - if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { - desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + if (desc && desc->pi_ctx->sig_protected) { + desc->pi_ctx->sig_protected = 0; ret = ib_check_mr_status(desc->pi_ctx->sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); if (ret) { diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index d851e18..dc439a4 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -235,7 +235,7 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn) rx_sg = &rx_desc->rx_sg; rx_sg->addr = rx_desc->dma_addr; rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = device->mr->lkey; + rx_sg->lkey = device->pd->local_dma_lkey; } isert_conn->rx_desc_head = 0; @@ -385,22 +385,12 @@ isert_create_device_ib_res(struct isert_device *device) goto out_cq; } - device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(device->mr)) { - ret = PTR_ERR(device->mr); - isert_err("failed to create dma mr, device %p, ret=%d\n", - device, ret); - goto out_mr; - } - /* Check signature cap */ device->pi_capable = dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER ? true : false; return 0; -out_mr: - ib_dealloc_pd(device->pd); out_cq: isert_free_comps(device); return ret; @@ -411,7 +401,6 @@ isert_free_device_ib_res(struct isert_device *device) { isert_info("device %p\n", device); - ib_dereg_mr(device->mr); ib_dealloc_pd(device->pd); isert_free_comps(device); } @@ -491,7 +480,7 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) if (fr_desc->pi_ctx) { ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); ib_dereg_mr(fr_desc->pi_ctx->prot_mr); - ib_destroy_mr(fr_desc->pi_ctx->sig_mr); + ib_dereg_mr(fr_desc->pi_ctx->sig_mr); kfree(fr_desc->pi_ctx); } kfree(fr_desc); @@ -508,7 +497,6 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc, struct ib_device *device, struct ib_pd *pd) { - struct ib_mr_init_attr mr_init_attr; struct pi_context *pi_ctx; int ret; @@ -527,7 +515,8 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc, goto err_pi_ctx; } - pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, + ISCSI_ISER_SG_TABLESIZE); if (IS_ERR(pi_ctx->prot_mr)) { isert_err("Failed to allocate prot frmr err=%ld\n", PTR_ERR(pi_ctx->prot_mr)); @@ -536,10 +525,7 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc, } desc->ind |= ISERT_PROT_KEY_VALID; - memset(&mr_init_attr, 0, sizeof(mr_init_attr)); - mr_init_attr.max_reg_descriptors = 2; - mr_init_attr.flags |= IB_MR_SIGNATURE_EN; - pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); if (IS_ERR(pi_ctx->sig_mr)) { isert_err("Failed to allocate signature enabled mr err=%ld\n", PTR_ERR(pi_ctx->sig_mr)); @@ -577,7 +563,8 @@ isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, return PTR_ERR(fr_desc->data_frpl); } - fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, + ISCSI_ISER_SG_TABLESIZE); if (IS_ERR(fr_desc->data_mr)) { isert_err("Failed to allocate data frmr err=%ld\n", PTR_ERR(fr_desc->data_mr)); @@ -1092,8 +1079,8 @@ isert_create_send_desc(struct isert_conn *isert_conn, tx_desc->num_sge = 1; tx_desc->isert_cmd = isert_cmd; - if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { - tx_desc->tx_sg[0].lkey = device->mr->lkey; + if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) { + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc); } } @@ -1116,7 +1103,7 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn, tx_desc->dma_addr = dma_addr; tx_desc->tx_sg[0].addr = tx_desc->dma_addr; tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; - tx_desc->tx_sg[0].lkey = device->mr->lkey; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n", tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length, @@ -1149,7 +1136,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn) memset(&sge, 0, sizeof(struct ib_sge)); sge.addr = isert_conn->login_req_dma; sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = isert_conn->device->mr->lkey; + sge.lkey = isert_conn->device->pd->local_dma_lkey; isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", sge.addr, sge.length, sge.lkey); @@ -1199,7 +1186,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, tx_dsg->addr = isert_conn->login_rsp_dma; tx_dsg->length = length; - tx_dsg->lkey = isert_conn->device->mr->lkey; + tx_dsg->lkey = isert_conn->device->pd->local_dma_lkey; tx_desc->num_sge = 2; } if (!login->login_failed) { @@ -2216,7 +2203,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->pdu_buf_len = pdu_len; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = pdu_len; - tx_dsg->lkey = device->mr->lkey; + tx_dsg->lkey = device->pd->local_dma_lkey; isert_cmd->tx_desc.num_sge = 2; } @@ -2344,7 +2331,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = ISCSI_HDR_LEN; - tx_dsg->lkey = device->mr->lkey; + tx_dsg->lkey = device->pd->local_dma_lkey; isert_cmd->tx_desc.num_sge = 2; isert_init_send_wr(isert_conn, isert_cmd, send_wr); @@ -2385,7 +2372,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) isert_cmd->pdu_buf_len = txt_rsp_len; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = txt_rsp_len; - tx_dsg->lkey = device->mr->lkey; + tx_dsg->lkey = device->pd->local_dma_lkey; isert_cmd->tx_desc.num_sge = 2; } isert_init_send_wr(isert_conn, isert_cmd, send_wr); @@ -2426,7 +2413,7 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; ib_sge->length = min_t(u32, data_left, ib_sg_dma_len(ib_dev, tmp_sg) - page_off); - ib_sge->lkey = device->mr->lkey; + ib_sge->lkey = device->pd->local_dma_lkey; isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", ib_sge->addr, ib_sge->length, ib_sge->lkey); @@ -2600,7 +2587,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn, u32 page_off; if (mem->dma_nents == 1) { - sge->lkey = device->mr->lkey; + sge->lkey = device->pd->local_dma_lkey; sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 9ec23a78..6a04ba3 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -209,7 +209,6 @@ struct isert_device { int refcount; struct ib_device *ib_device; struct ib_pd *pd; - struct ib_mr *mr; struct isert_comp *comps; int comps_used; struct list_head dev_node; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 31a20b4..b481490 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -55,8 +55,8 @@ #define DRV_NAME "ib_srp" #define PFX DRV_NAME ": " -#define DRV_VERSION "1.0" -#define DRV_RELDATE "July 1, 2013" +#define DRV_VERSION "2.0" +#define DRV_RELDATE "July 26, 2015" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); @@ -68,8 +68,8 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; -static bool prefer_fr; -static bool register_always; +static bool prefer_fr = true; +static bool register_always = true; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -131,7 +131,7 @@ MODULE_PARM_DESC(ch_count, "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); static void srp_add_one(struct ib_device *device); -static void srp_remove_one(struct ib_device *device); +static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); static void srp_send_completion(struct ib_cq *cq, void *ch_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); @@ -378,7 +378,8 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, INIT_LIST_HEAD(&pool->free_list); for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { - mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, + max_page_list_len); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto destroy_pool; @@ -545,7 +546,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (ret) goto err_qp; - if (dev->use_fast_reg && dev->has_fr) { + if (dev->use_fast_reg) { fr_pool = srp_alloc_fr_pool(target); if (IS_ERR(fr_pool)) { ret = PTR_ERR(fr_pool); @@ -553,10 +554,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) "FR pool allocation failed (%d)\n", ret); goto err_qp; } - if (ch->fr_pool) - srp_destroy_fr_pool(ch->fr_pool); - ch->fr_pool = fr_pool; - } else if (!dev->use_fast_reg && dev->has_fmr) { + } else if (dev->use_fmr) { fmr_pool = srp_alloc_fmr_pool(target); if (IS_ERR(fmr_pool)) { ret = PTR_ERR(fmr_pool); @@ -564,9 +562,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) "FMR pool allocation failed (%d)\n", ret); goto err_qp; } - if (ch->fmr_pool) - ib_destroy_fmr_pool(ch->fmr_pool); - ch->fmr_pool = fmr_pool; } if (ch->qp) @@ -580,6 +575,16 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) ch->recv_cq = recv_cq; ch->send_cq = send_cq; + if (dev->use_fast_reg) { + if (ch->fr_pool) + srp_destroy_fr_pool(ch->fr_pool); + ch->fr_pool = fr_pool; + } else if (dev->use_fmr) { + if (ch->fmr_pool) + ib_destroy_fmr_pool(ch->fmr_pool); + ch->fmr_pool = fmr_pool; + } + kfree(init_attr); return 0; @@ -622,7 +627,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (dev->use_fast_reg) { if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); - } else { + } else if (dev->use_fmr) { if (ch->fmr_pool) ib_destroy_fmr_pool(ch->fmr_pool); } @@ -1084,7 +1089,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, if (req->nmdesc) srp_fr_pool_put(ch->fr_pool, req->fr_list, req->nmdesc); - } else { + } else if (dev->use_fmr) { struct ib_pool_fmr **pfmr; for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) @@ -1259,6 +1264,8 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, { struct srp_direct_buf *desc = state->desc; + WARN_ON_ONCE(!dma_len); + desc->va = cpu_to_be64(dma_addr); desc->key = cpu_to_be32(rkey); desc->len = cpu_to_be32(dma_len); @@ -1271,18 +1278,24 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, static int srp_map_finish_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch) { + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; struct ib_pool_fmr *fmr; u64 io_addr = 0; + if (state->fmr.next >= state->fmr.end) + return -ENOMEM; + fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages, state->npages, io_addr); if (IS_ERR(fmr)) return PTR_ERR(fmr); - *state->next_fmr++ = fmr; + *state->fmr.next++ = fmr; state->nmdesc++; - srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); + srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask, + state->dma_len, fmr->fmr->rkey); return 0; } @@ -1297,6 +1310,9 @@ static int srp_map_finish_fr(struct srp_map_state *state, struct srp_fr_desc *desc; u32 rkey; + if (state->fr.next >= state->fr.end) + return -ENOMEM; + desc = srp_fr_pool_get(ch->fr_pool); if (!desc) return -ENOMEM; @@ -1320,7 +1336,7 @@ static int srp_map_finish_fr(struct srp_map_state *state, IB_ACCESS_REMOTE_WRITE); wr.wr.fast_reg.rkey = desc->mr->lkey; - *state->next_fr++ = desc; + *state->fr.next++ = desc; state->nmdesc++; srp_map_desc(state, state->base_dma_addr, state->dma_len, @@ -1333,17 +1349,19 @@ static int srp_finish_mapping(struct srp_map_state *state, struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; int ret = 0; + WARN_ON_ONCE(!dev->use_fast_reg && !dev->use_fmr); + if (state->npages == 0) return 0; - if (state->npages == 1 && !register_always) + if (state->npages == 1 && target->global_mr) srp_map_desc(state, state->base_dma_addr, state->dma_len, - target->rkey); + target->global_mr->rkey); else - ret = target->srp_host->srp_dev->use_fast_reg ? - srp_map_finish_fr(state, ch) : + ret = dev->use_fast_reg ? srp_map_finish_fr(state, ch) : srp_map_finish_fmr(state, ch); if (ret == 0) { @@ -1354,66 +1372,19 @@ static int srp_finish_mapping(struct srp_map_state *state, return ret; } -static void srp_map_update_start(struct srp_map_state *state, - struct scatterlist *sg, int sg_index, - dma_addr_t dma_addr) -{ - state->unmapped_sg = sg; - state->unmapped_index = sg_index; - state->unmapped_addr = dma_addr; -} - static int srp_map_sg_entry(struct srp_map_state *state, struct srp_rdma_ch *ch, - struct scatterlist *sg, int sg_index, - bool use_mr) + struct scatterlist *sg, int sg_index) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - unsigned int len; + unsigned int len = 0; int ret; - if (!dma_len) - return 0; - - if (!use_mr) { - /* - * Once we're in direct map mode for a request, we don't - * go back to FMR or FR mode, so no need to update anything - * other than the descriptor. - */ - srp_map_desc(state, dma_addr, dma_len, target->rkey); - return 0; - } - - /* - * Since not all RDMA HW drivers support non-zero page offsets for - * FMR, if we start at an offset into a page, don't merge into the - * current FMR mapping. Finish it out, and use the kernel's MR for - * this sg entry. - */ - if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || - dma_len > dev->mr_max_size) { - ret = srp_finish_mapping(state, ch); - if (ret) - return ret; - - srp_map_desc(state, dma_addr, dma_len, target->rkey); - srp_map_update_start(state, NULL, 0, 0); - return 0; - } - - /* - * If this is the first sg that will be mapped via FMR or via FR, save - * our position. We need to know the first unmapped entry, its index, - * and the first unmapped address within that entry to be able to - * restart mapping after an error. - */ - if (!state->unmapped_sg) - srp_map_update_start(state, sg, sg_index, dma_addr); + WARN_ON_ONCE(!dma_len); while (dma_len) { unsigned offset = dma_addr & ~dev->mr_page_mask; @@ -1421,8 +1392,6 @@ static int srp_map_sg_entry(struct srp_map_state *state, ret = srp_finish_mapping(state, ch); if (ret) return ret; - - srp_map_update_start(state, sg, sg_index, dma_addr); } len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); @@ -1441,11 +1410,8 @@ static int srp_map_sg_entry(struct srp_map_state *state, * boundries. */ ret = 0; - if (len != dev->mr_page_size) { + if (len != dev->mr_page_size) ret = srp_finish_mapping(state, ch); - if (!ret) - srp_map_update_start(state, NULL, 0, 0); - } return ret; } @@ -1455,50 +1421,80 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch, { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; - struct ib_device *ibdev = dev->dev; struct scatterlist *sg; - int i; - bool use_mr; + int i, ret; state->desc = req->indirect_desc; state->pages = req->map_page; if (dev->use_fast_reg) { - state->next_fr = req->fr_list; - use_mr = !!ch->fr_pool; - } else { - state->next_fmr = req->fmr_list; - use_mr = !!ch->fmr_pool; + state->fr.next = req->fr_list; + state->fr.end = req->fr_list + target->cmd_sg_cnt; + } else if (dev->use_fmr) { + state->fmr.next = req->fmr_list; + state->fmr.end = req->fmr_list + target->cmd_sg_cnt; } - for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(state, ch, sg, i, use_mr)) { - /* - * Memory registration failed, so backtrack to the - * first unmapped entry and continue on without using - * memory registration. - */ - dma_addr_t dma_addr; - unsigned int dma_len; - -backtrack: - sg = state->unmapped_sg; - i = state->unmapped_index; - - dma_addr = ib_sg_dma_address(ibdev, sg); - dma_len = ib_sg_dma_len(ibdev, sg); - dma_len -= (state->unmapped_addr - dma_addr); - dma_addr = state->unmapped_addr; - use_mr = false; - srp_map_desc(state, dma_addr, dma_len, target->rkey); + if (dev->use_fast_reg || dev->use_fmr) { + for_each_sg(scat, sg, count, i) { + ret = srp_map_sg_entry(state, ch, sg, i); + if (ret) + goto out; + } + ret = srp_finish_mapping(state, ch); + if (ret) + goto out; + } else { + for_each_sg(scat, sg, count, i) { + srp_map_desc(state, ib_sg_dma_address(dev->dev, sg), + ib_sg_dma_len(dev->dev, sg), + target->global_mr->rkey); } } - if (use_mr && srp_finish_mapping(state, ch)) - goto backtrack; - req->nmdesc = state->nmdesc; + ret = 0; - return 0; +out: + return ret; +} + +/* + * Register the indirect data buffer descriptor with the HCA. + * + * Note: since the indirect data buffer descriptor has been allocated with + * kmalloc() it is guaranteed that this buffer is a physically contiguous + * memory buffer. + */ +static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, + void **next_mr, void **end_mr, u32 idb_len, + __be32 *idb_rkey) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct srp_map_state state; + struct srp_direct_buf idb_desc; + u64 idb_pages[1]; + int ret; + + memset(&state, 0, sizeof(state)); + memset(&idb_desc, 0, sizeof(idb_desc)); + state.gen.next = next_mr; + state.gen.end = end_mr; + state.desc = &idb_desc; + state.pages = idb_pages; + state.pages[0] = (req->indirect_dma_addr & + dev->mr_page_mask); + state.npages = 1; + state.base_dma_addr = req->indirect_dma_addr; + state.dma_len = idb_len; + ret = srp_finish_mapping(&state, ch); + if (ret < 0) + goto out; + + *idb_rkey = idb_desc.key; + +out: + return ret; } static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, @@ -1507,12 +1503,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, struct srp_target_port *target = ch->target; struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int len, nents, count; + int len, nents, count, ret; struct srp_device *dev; struct ib_device *ibdev; struct srp_map_state state; struct srp_indirect_buf *indirect_hdr; - u32 table_len; + u32 idb_len, table_len; + __be32 idb_rkey; u8 fmt; if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) @@ -1539,7 +1536,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); - if (count == 1 && !register_always) { + if (count == 1 && target->global_mr) { /* * The midlayer only generated a single gather/scatter * entry, or DMA mapping coalesced everything to a @@ -1549,7 +1546,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, struct srp_direct_buf *buf = (void *) cmd->add_data; buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); - buf->key = cpu_to_be32(target->rkey); + buf->key = cpu_to_be32(target->global_mr->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); req->nmdesc = 0; @@ -1594,6 +1591,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, count = min(state.ndesc, target->cmd_sg_cnt); table_len = state.ndesc * sizeof (struct srp_direct_buf); + idb_len = sizeof(struct srp_indirect_buf) + table_len; fmt = SRP_DATA_DESC_INDIRECT; len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); @@ -1602,8 +1600,18 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, memcpy(indirect_hdr->desc_list, req->indirect_desc, count * sizeof (struct srp_direct_buf)); + if (!target->global_mr) { + ret = srp_map_idb(ch, req, state.gen.next, state.gen.end, + idb_len, &idb_rkey); + if (ret < 0) + return ret; + req->nmdesc++; + } else { + idb_rkey = target->global_mr->rkey; + } + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); - indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); + indirect_hdr->table_desc.key = idb_rkey; indirect_hdr->table_desc.len = cpu_to_be32(table_len); indirect_hdr->len = cpu_to_be32(state.total_len); @@ -2171,7 +2179,7 @@ static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) } static void srp_cm_rep_handler(struct ib_cm_id *cm_id, - struct srp_login_rsp *lrsp, + const struct srp_login_rsp *lrsp, struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; @@ -2757,6 +2765,13 @@ static int srp_sdev_count(struct Scsi_Host *host) return c; } +/* + * Return values: + * < 0 upon failure. Caller is responsible for SRP target port cleanup. + * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port + * removal has been scheduled. + * 0 and target->state != SRP_TARGET_REMOVED upon success. + */ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) { struct srp_rport_identifiers ids; @@ -3146,8 +3161,8 @@ static ssize_t srp_create_target(struct device *dev, target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; - target->lkey = host->srp_dev->mr->lkey; - target->rkey = host->srp_dev->mr->rkey; + target->lkey = host->srp_dev->pd->local_dma_lkey; + target->global_mr = host->srp_dev->global_mr; target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; @@ -3262,7 +3277,7 @@ static ssize_t srp_create_target(struct device *dev, srp_free_ch_ib(target, ch); srp_free_req_data(target, ch); target->ch_count = ch - target->ch; - break; + goto connected; } } @@ -3272,6 +3287,7 @@ static ssize_t srp_create_target(struct device *dev, node_idx++; } +connected: target->scsi_host->nr_hw_queues = target->ch_count; ret = srp_add_target(host, target); @@ -3294,6 +3310,8 @@ out: mutex_unlock(&host->add_target_mutex); scsi_host_put(target->scsi_host); + if (ret < 0) + scsi_host_put(target->scsi_host); return ret; @@ -3401,6 +3419,7 @@ static void srp_add_one(struct ib_device *device) srp_dev->use_fast_reg = (srp_dev->has_fr && (!srp_dev->has_fmr || prefer_fr)); + srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; /* * Use the smallest page size supported by the HCA, down to a @@ -3433,12 +3452,16 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->pd)) goto free_dev; - srp_dev->mr = ib_get_dma_mr(srp_dev->pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(srp_dev->mr)) - goto err_pd; + if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) { + srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(srp_dev->global_mr)) + goto err_pd; + } else { + srp_dev->global_mr = NULL; + } for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { host = srp_add_port(srp_dev, p); @@ -3460,13 +3483,13 @@ free_attr: kfree(dev_attr); } -static void srp_remove_one(struct ib_device *device) +static void srp_remove_one(struct ib_device *device, void *client_data) { struct srp_device *srp_dev; struct srp_host *host, *tmp_host; struct srp_target_port *target; - srp_dev = ib_get_client_data(device, &srp_client); + srp_dev = client_data; if (!srp_dev) return; @@ -3495,7 +3518,8 @@ static void srp_remove_one(struct ib_device *device) kfree(host); } - ib_dereg_mr(srp_dev->mr); + if (srp_dev->global_mr) + ib_dereg_mr(srp_dev->global_mr); ib_dealloc_pd(srp_dev->pd); kfree(srp_dev); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 17ee3f8..3608f2e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -95,13 +95,14 @@ struct srp_device { struct list_head dev_list; struct ib_device *dev; struct ib_pd *pd; - struct ib_mr *mr; + struct ib_mr *global_mr; u64 mr_page_mask; int mr_page_size; int mr_max_size; int max_pages_per_mr; bool has_fmr; bool has_fr; + bool use_fmr; bool use_fast_reg; }; @@ -182,10 +183,10 @@ struct srp_target_port { spinlock_t lock; /* read only in the hot path */ + struct ib_mr *global_mr; struct srp_rdma_ch *ch; u32 ch_count; u32 lkey; - u32 rkey; enum srp_target_state state; unsigned int max_iu_len; unsigned int cmd_sg_cnt; @@ -276,14 +277,21 @@ struct srp_fr_pool { * @npages: Number of page addresses in the pages[] array. * @nmdesc: Number of FMR or FR memory descriptors used for mapping. * @ndesc: Number of SRP buffer descriptors that have been filled in. - * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. - * @unmapped_index: Index of the first element mapped via FMR or FR. - * @unmapped_addr: DMA address of the first element mapped via FMR or FR. */ struct srp_map_state { union { - struct ib_pool_fmr **next_fmr; - struct srp_fr_desc **next_fr; + struct { + struct ib_pool_fmr **next; + struct ib_pool_fmr **end; + } fmr; + struct { + struct srp_fr_desc **next; + struct srp_fr_desc **end; + } fr; + struct { + void **next; + void **end; + } gen; }; struct srp_direct_buf *desc; u64 *pages; @@ -293,9 +301,6 @@ struct srp_map_state { unsigned int npages; unsigned int nmdesc; unsigned int ndesc; - struct scatterlist *unmapped_sg; - int unmapped_index; - dma_addr_t unmapped_addr; }; #endif /* IB_SRP_H */ diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 60ff0a2..f6fe041 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -783,7 +783,7 @@ static int srpt_post_recv(struct srpt_device *sdev, list.addr = ioctx->ioctx.dma; list.length = srp_max_req_size; - list.lkey = sdev->mr->lkey; + list.lkey = sdev->pd->local_dma_lkey; wr.next = NULL; wr.sg_list = &list; @@ -818,7 +818,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch, list.addr = ioctx->ioctx.dma; list.length = len; - list.lkey = sdev->mr->lkey; + list.lkey = sdev->pd->local_dma_lkey; wr.next = NULL; wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); @@ -1206,7 +1206,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, while (rsize > 0 && tsize > 0) { sge->addr = dma_addr; - sge->lkey = ch->sport->sdev->mr->lkey; + sge->lkey = ch->sport->sdev->pd->local_dma_lkey; if (rsize >= dma_len) { sge->length = @@ -3211,10 +3211,6 @@ static void srpt_add_one(struct ib_device *device) if (IS_ERR(sdev->pd)) goto free_dev; - sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(sdev->mr)) - goto err_pd; - sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); srq_attr.event_handler = srpt_srq_event; @@ -3226,7 +3222,7 @@ static void srpt_add_one(struct ib_device *device) sdev->srq = ib_create_srq(sdev->pd, &srq_attr); if (IS_ERR(sdev->srq)) - goto err_mr; + goto err_pd; pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, @@ -3250,7 +3246,7 @@ static void srpt_add_one(struct ib_device *device) * in the system as service_id; therefore, the target_id will change * if this HCA is gone bad and replaced by different HCA */ - if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL)) + if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0)) goto err_cm; INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, @@ -3311,8 +3307,6 @@ err_cm: ib_destroy_cm_id(sdev->cm_id); err_srq: ib_destroy_srq(sdev->srq); -err_mr: - ib_dereg_mr(sdev->mr); err_pd: ib_dealloc_pd(sdev->pd); free_dev: @@ -3326,12 +3320,11 @@ err: /** * srpt_remove_one() - InfiniBand device removal callback function. */ -static void srpt_remove_one(struct ib_device *device) +static void srpt_remove_one(struct ib_device *device, void *client_data) { - struct srpt_device *sdev; + struct srpt_device *sdev = client_data; int i; - sdev = ib_get_client_data(device, &srpt_client); if (!sdev) { pr_info("%s(%s): nothing to do.\n", __func__, device->name); return; @@ -3358,7 +3351,6 @@ static void srpt_remove_one(struct ib_device *device) srpt_release_sdev(sdev); ib_destroy_srq(sdev->srq); - ib_dereg_mr(sdev->mr); ib_dealloc_pd(sdev->pd); srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 21f8df6..5faad8ac 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -393,7 +393,6 @@ struct srpt_port { struct srpt_device { struct ib_device *device; struct ib_pd *pd; - struct ib_mr *mr; struct ib_srq *srq; struct ib_cm_id *cm_id; struct ib_device_attr dev_attr; |