path: root/sys/ofed/drivers/infiniband
author     hselasky <hselasky@FreeBSD.org>	2015-02-17 08:40:27 +0000
committer  hselasky <hselasky@FreeBSD.org>	2015-02-17 08:40:27 +0000
commit     5dbc43f9a598a3032616939f59ebc1ab4ff295fd (patch)
tree       64746faea5ee1c213d909e22c077f2b289427a99 /sys/ofed/drivers/infiniband
parent     49152454e7ff6ebf5ddf6533f15ed3d75f1d3d41 (diff)
Update the InfiniBand stack to Mellanox's OFED version 2.1.

Highlights:
 - Multiple verbs API updates
 - Support for RoCE, RDMA over Ethernet

All hardware drivers depending on the common InfiniBand stack have been
updated as well.

Discussed with:	np @
Sponsored by:	Mellanox Technologies
MFC after:	1 month
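The verbs API changes can be seen directly in the diff below: rdma_create_id()
now takes an explicit QP type, and rdma_translate_ip() gained an optional VLAN
id output for RoCE. The following is a minimal sketch of how a kernel consumer
would call the updated functions; the signatures are taken from this diff,
while the handler body, error handling, and function names are illustrative
assumptions only.

#include <linux/err.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_addr.h>

static int example_cm_handler(struct rdma_cm_id *id,
    struct rdma_cm_event *event)
{
	/* Hypothetical event handler: accept every event. */
	return 0;
}

static int example_setup(struct sockaddr *local)
{
	struct rdma_dev_addr dev_addr = { };
	struct rdma_cm_id *id;
	u16 vlan_id;
	int ret;

	/* The QP type is now passed at creation time instead of being
	 * derived from the port space. */
	id = rdma_create_id(example_cm_handler, NULL, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id))
		return PTR_ERR(id);

	/* The new vlan_id output argument may be NULL when not needed. */
	ret = rdma_translate_ip(local, &dev_addr, &vlan_id);
	if (ret)
		rdma_destroy_id(id);
	return ret;
}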
Diffstat (limited to 'sys/ofed/drivers/infiniband')
-rw-r--r--  sys/ofed/drivers/infiniband/core/Makefile | 32
-rw-r--r--  sys/ofed/drivers/infiniband/core/addr.c | 278
-rw-r--r--  sys/ofed/drivers/infiniband/core/cache.c | 117
-rw-r--r--  sys/ofed/drivers/infiniband/core/cm.c | 283
-rw-r--r--  sys/ofed/drivers/infiniband/core/cm_msgs.h | 45
-rw-r--r--  sys/ofed/drivers/infiniband/core/cma.c | 1242
-rw-r--r--  sys/ofed/drivers/infiniband/core/core_priv.h | 3
-rw-r--r--  sys/ofed/drivers/infiniband/core/device.c | 65
-rw-r--r--  sys/ofed/drivers/infiniband/core/fmr_pool.c | 3
-rw-r--r--  sys/ofed/drivers/infiniband/core/iwcm.c | 33
-rw-r--r--  sys/ofed/drivers/infiniband/core/local_sa.c | 1273
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad.c | 755
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad_priv.h | 35
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad_rmpp.c | 2
-rw-r--r--  sys/ofed/drivers/infiniband/core/multicast.c | 60
-rw-r--r--  sys/ofed/drivers/infiniband/core/notice.c | 749
-rw-r--r--  sys/ofed/drivers/infiniband/core/packer.c | 1
-rw-r--r--  sys/ofed/drivers/infiniband/core/peer_mem.c | 461
-rw-r--r--  sys/ofed/drivers/infiniband/core/sa.h | 39
-rw-r--r--  sys/ofed/drivers/infiniband/core/sa_query.c | 412
-rw-r--r--  sys/ofed/drivers/infiniband/core/smi.c | 8
-rw-r--r--  sys/ofed/drivers/infiniband/core/sysfs.c | 129
-rw-r--r--  sys/ofed/drivers/infiniband/core/ucm.c | 62
-rw-r--r--  sys/ofed/drivers/infiniband/core/ucma.c | 188
-rw-r--r--  sys/ofed/drivers/infiniband/core/ud_header.c | 63
-rw-r--r--  sys/ofed/drivers/infiniband/core/umem.c | 559
-rw-r--r--  sys/ofed/drivers/infiniband/core/user_mad.c | 237
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs.h | 95
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_cmd.c | 2848
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_main.c | 758
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_marshall.c | 5
-rw-r--r--  sys/ofed/drivers/infiniband/core/verbs.c | 663
-rw-r--r--  sys/ofed/drivers/infiniband/debug/memtrack.c | 658
-rw-r--r--  sys/ofed/drivers/infiniband/debug/memtrack.h | 71
-rw-r--r--  sys/ofed/drivers/infiniband/debug/mtrack.h | 778
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/Makefile | 31
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/ah.c | 24
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c | 23
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/cm.c | 60
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/cq.c | 140
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c | 4
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mad.c | 98
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/main.c | 1544
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mcg.c | 32
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c | 116
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h | 46
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h | 76
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/mr.c | 248
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/qp.c | 368
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c | 4
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c | 50
-rw-r--r--  sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c | 3
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/Makefile | 11
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h | 2
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c | 4
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c | 1
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8
57 files changed, 9664 insertions, 6239 deletions
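Among the RoCE-related additions, the core/addr.c hunk below exports two new
L2 resolution helpers, rdma_addr_find_smac_by_sgid() and
rdma_addr_find_dmac_by_grh(). The sketch below shows how a verbs consumer
might call them to fill in source/destination MACs for an Ethernet port; the
signatures come from the diff, while the wrapper function and its buffer
arguments (ETH_ALEN-sized, per the memcpy in the diff) are hypothetical.

#include <rdma/ib_addr.h>
#include <rdma/ib_verbs.h>

static int example_resolve_l2(union ib_gid *sgid, union ib_gid *dgid,
    u8 *smac, u8 *dmac)
{
	u16 vlan_id;
	int ret;

	/* Source MAC and VLAN are derived from the local SGID. */
	ret = rdma_addr_find_smac_by_sgid(sgid, smac, &vlan_id);
	if (ret)
		return ret;

	/* Destination MAC is resolved from the GRH's GID pair. */
	return rdma_addr_find_dmac_by_grh(sgid, dgid, dmac, &vlan_id);
}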
diff --git a/sys/ofed/drivers/infiniband/core/Makefile b/sys/ofed/drivers/infiniband/core/Makefile
deleted file mode 100644
index f646040..0000000
--- a/sys/ofed/drivers/infiniband/core/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o
-user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
-
-obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
- ib_cm.o iw_cm.o $(infiniband-y)
-obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
-obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
- $(user_access-y)
-
-ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
- device.o fmr_pool.o cache.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
-
-ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
-
-ib_sa-y := sa_query.o multicast.o notice.o local_sa.o
-
-ib_cm-y := cm.o
-
-iw_cm-y := iwcm.o
-
-rdma_cm-y := cma.o
-
-rdma_ucm-y := ucma.o
-
-ib_addr-y := addr.o
-
-ib_umad-y := user_mad.o
-
-ib_ucm-y := ucm.o
-
-ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c
index c3d5b4f..e85b554 100644
--- a/sys/ofed/drivers/infiniband/core/addr.c
+++ b/sys/ofed/drivers/infiniband/core/addr.c
@@ -69,6 +69,7 @@ static LIST_HEAD(req_list);
static struct delayed_work work;
static struct workqueue_struct *addr_wq;
+static struct rdma_addr_client self;
void rdma_addr_register_client(struct rdma_addr_client *client)
{
atomic_set(&client->refcount, 1);
@@ -89,19 +90,6 @@ void rdma_addr_unregister_client(struct rdma_addr_client *client)
}
EXPORT_SYMBOL(rdma_addr_unregister_client);
-#ifdef __linux__
-int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
- const unsigned char *dst_dev_addr)
-{
- dev_addr->dev_type = dev->type;
- memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
- memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
- if (dst_dev_addr)
- memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
- dev_addr->bound_dev_if = dev->ifindex;
- return 0;
-}
-#else
int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
const unsigned char *dst_dev_addr)
{
@@ -119,10 +107,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
dev_addr->bound_dev_if = dev->if_index;
return 0;
}
-#endif
EXPORT_SYMBOL(rdma_copy_addr);
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+ u16 *vlan_id)
{
struct net_device *dev;
int ret = -EADDRNOTAVAIL;
@@ -137,33 +125,21 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
}
switch (addr->sa_family) {
-#ifdef INET
case AF_INET:
- dev = ip_dev_find(NULL,
+ dev = ip_dev_find(&init_net,
((struct sockaddr_in *) addr)->sin_addr.s_addr);
if (!dev)
return ret;
ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
dev_put(dev);
break;
-#endif
#if defined(INET6)
case AF_INET6:
-#ifdef __linux__
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
- if (ipv6_chk_addr(&init_net,
- &((struct sockaddr_in6 *) addr)->sin6_addr,
- dev, 1)) {
- ret = rdma_copy_addr(dev_addr, dev, NULL);
- break;
- }
- }
- read_unlock(&dev_base_lock);
-#else
{
struct sockaddr_in6 *sin6;
struct ifaddr *ifa;
@@ -179,12 +155,12 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
break;
}
ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp);
ifa_free(ifa);
break;
}
#endif
- break;
-#endif
}
return ret;
}
@@ -218,127 +194,6 @@ static void queue_req(struct addr_req *req)
mutex_unlock(&lock);
}
-#ifdef __linux__
-static int addr4_resolve(struct sockaddr_in *src_in,
- struct sockaddr_in *dst_in,
- struct rdma_dev_addr *addr)
-{
- __be32 src_ip = src_in->sin_addr.s_addr;
- __be32 dst_ip = dst_in->sin_addr.s_addr;
- struct flowi fl;
- struct rtable *rt;
- struct neighbour *neigh;
- int ret;
-
- memset(&fl, 0, sizeof fl);
- fl.nl_u.ip4_u.daddr = dst_ip;
- fl.nl_u.ip4_u.saddr = src_ip;
- fl.oif = addr->bound_dev_if;
-
- ret = ip_route_output_key(&init_net, &rt, &fl);
- if (ret)
- goto out;
-
- src_in->sin_family = AF_INET;
- src_in->sin_addr.s_addr = rt->rt_src;
-
- if (rt->idev->dev->flags & IFF_LOOPBACK) {
- ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- goto put;
- }
-
- /* If the device does ARP internally, return 'done' */
- if (rt->idev->dev->flags & IFF_NOARP) {
- rdma_copy_addr(addr, rt->idev->dev, NULL);
- goto put;
- }
-
- neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
- if (!neigh || !(neigh->nud_state & NUD_VALID)) {
- neigh_event_send(rt->u.dst.neighbour, NULL);
- ret = -ENODATA;
- if (neigh)
- goto release;
- goto put;
- }
-
- ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
-release:
- neigh_release(neigh);
-put:
- ip_rt_put(rt);
-out:
- return ret;
-}
-
-#if defined(INET6)
-static int addr6_resolve(struct sockaddr_in6 *src_in,
- struct sockaddr_in6 *dst_in,
- struct rdma_dev_addr *addr)
-{
- struct flowi fl;
- struct neighbour *neigh;
- struct dst_entry *dst;
- int ret;
-
- memset(&fl, 0, sizeof fl);
- ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr);
- ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr);
- fl.oif = addr->bound_dev_if;
-
- dst = ip6_route_output(&init_net, NULL, &fl);
- if ((ret = dst->error))
- goto put;
-
- if (ipv6_addr_any(&fl.fl6_src)) {
- ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
- &fl.fl6_dst, 0, &fl.fl6_src);
- if (ret)
- goto put;
-
- src_in->sin6_family = AF_INET6;
- ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src);
- }
-
- if (dst->dev->flags & IFF_LOOPBACK) {
- ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- goto put;
- }
-
- /* If the device does ARP internally, return 'done' */
- if (dst->dev->flags & IFF_NOARP) {
- ret = rdma_copy_addr(addr, dst->dev, NULL);
- goto put;
- }
-
- neigh = dst->neighbour;
- if (!neigh || !(neigh->nud_state & NUD_VALID)) {
- neigh_event_send(dst->neighbour, NULL);
- ret = -ENODATA;
- goto put;
- }
-
- ret = rdma_copy_addr(addr, dst->dev, neigh->ha);
-put:
- dst_release(dst);
- return ret;
-}
-#else
-static int addr6_resolve(struct sockaddr_in6 *src_in,
- struct sockaddr_in6 *dst_in,
- struct rdma_dev_addr *addr)
-{
- return -EADDRNOTAVAIL;
-}
-#endif
-
-#else
-#include <netinet/if_ether.h>
-
static int addr_resolve(struct sockaddr *src_in,
struct sockaddr *dst_in,
struct rdma_dev_addr *addr)
@@ -354,7 +209,6 @@ static int addr_resolve(struct sockaddr *src_in,
int bcast;
int is_gw = 0;
int error = 0;
-
/*
* Determine whether the address is unicast, multicast, or broadcast
* and whether the source interface is valid.
@@ -382,8 +236,7 @@ static int addr_resolve(struct sockaddr *src_in,
port = sin->sin_port;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
- } else
- src_in = NULL;
+ }
break;
#endif
#ifdef INET6
@@ -406,7 +259,7 @@ static int addr_resolve(struct sockaddr *src_in,
* If we have a source address to use look it up first and verify
* that it is a local interface.
*/
- if (src_in) {
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
ifa = ifa_ifwithaddr(src_in);
if (sin)
sin->sin_port = port;
@@ -436,15 +289,20 @@ static int addr_resolve(struct sockaddr *src_in,
* correct interface pointer and unlock the route.
*/
if (multi || bcast) {
- if (ifp == NULL)
+ if (ifp == NULL) {
ifp = rte->rt_ifp;
+ /* rt_ifa holds the route answer source address */
+ ifa = rte->rt_ifa;
+ }
RTFREE_LOCKED(rte);
} else if (ifp && ifp != rte->rt_ifp) {
RTFREE_LOCKED(rte);
return -ENETUNREACH;
} else {
- if (ifp == NULL)
+ if (ifp == NULL) {
ifp = rte->rt_ifp;
+ ifa = rte->rt_ifa;
+ }
RT_UNLOCK(rte);
}
mcast:
@@ -459,6 +317,8 @@ mcast:
error = rdma_copy_addr(addr, ifp,
LLADDR((struct sockaddr_dl *)llsa));
free(llsa, M_IFMADDR);
+ if (error == 0)
+ memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr));
return error;
}
/*
@@ -472,7 +332,7 @@ mcast:
#endif
#ifdef INET6
case AF_INET6:
- error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst,NULL);
+ error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, NULL);
break;
#endif
default:
@@ -480,15 +340,15 @@ mcast:
error = -EINVAL;
}
RTFREE(rte);
- if (error == 0)
+ if (error == 0) {
+ memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr));
return rdma_copy_addr(addr, ifp, edst);
+ }
if (error == EWOULDBLOCK)
return -ENODATA;
return -error;
}
-#endif
-
static void process_req(struct work_struct *work)
{
struct addr_req *req, *temp_req;
@@ -602,20 +462,94 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr)
}
EXPORT_SYMBOL(rdma_addr_cancel);
+struct resolve_cb_context {
+ struct rdma_dev_addr *addr;
+ struct completion comp;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context)
+{
+ memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct
+ rdma_dev_addr));
+ complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
+ u16 *vlan_id)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ struct resolve_cb_context ctx;
+ struct net_device *dev;
+
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+
+
+ ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+ if (ret)
+ return ret;
+
+ ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+ if (ret)
+ return ret;
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+
+ ctx.addr = &dev_addr;
+ init_completion(&ctx.comp);
+ ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
+ &dev_addr, 1000, resolve_cb, &ctx);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&ctx.comp);
+
+ memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+ dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } gid_addr;
+
+ ret = rdma_gid2ip(&gid_addr._sockaddr, sgid);
+
+ if (ret)
+ return ret;
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
+ if (ret)
+ return ret;
+
+ memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
+
static int netevent_callback(struct notifier_block *self, unsigned long event,
void *ctx)
{
if (event == NETEVENT_NEIGH_UPDATE) {
-#ifdef __linux__
- struct neighbour *neigh = ctx;
-
- if (neigh->nud_state & NUD_VALID) {
set_timeout(jiffies);
}
-#else
- set_timeout(jiffies);
-#endif
- }
return 0;
}
@@ -631,11 +565,13 @@ static int __init addr_init(void)
return -ENOMEM;
register_netevent_notifier(&nb);
+ rdma_addr_register_client(&self);
return 0;
}
static void __exit addr_cleanup(void)
{
+ rdma_addr_unregister_client(&self);
unregister_netevent_notifier(&nb);
destroy_workqueue(addr_wq);
}
diff --git a/sys/ofed/drivers/infiniband/core/cache.c b/sys/ofed/drivers/infiniband/core/cache.c
index 660bff5..d11e7c2 100644
--- a/sys/ofed/drivers/infiniband/core/cache.c
+++ b/sys/ofed/drivers/infiniband/core/cache.c
@@ -76,19 +76,21 @@ int ib_get_cached_gid(struct ib_device *device,
{
struct ib_gid_cache *cache;
unsigned long flags;
- int ret = 0;
+ int ret = -EINVAL;
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.gid_cache[port_num - start_port(device)];
+ if (device->cache.gid_cache) {
+ cache = device->cache.gid_cache[port_num - start_port(device)];
- if (index < 0 || index >= cache->table_len)
- ret = -EINVAL;
- else
- *gid = cache->table[index];
+ if (cache && index >= 0 && index < cache->table_len) {
+ *gid = cache->table[index];
+ ret = 0;
+ }
+ }
read_unlock_irqrestore(&device->cache.lock, flags);
@@ -111,22 +113,24 @@ int ib_find_cached_gid(struct ib_device *device,
*index = -1;
read_lock_irqsave(&device->cache.lock, flags);
-
+ if (!device->cache.gid_cache)
+ goto out;
for (p = 0; p <= end_port(device) - start_port(device); ++p) {
cache = device->cache.gid_cache[p];
+ if (!cache)
+ continue;
for (i = 0; i < cache->table_len; ++i) {
if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
*port_num = p + start_port(device);
if (index)
*index = i;
ret = 0;
- goto found;
+ goto out;
}
}
}
-found:
+out:
read_unlock_irqrestore(&device->cache.lock, flags);
-
return ret;
}
EXPORT_SYMBOL(ib_find_cached_gid);
@@ -138,19 +142,21 @@ int ib_get_cached_pkey(struct ib_device *device,
{
struct ib_pkey_cache *cache;
unsigned long flags;
- int ret = 0;
+ int ret = -EINVAL;
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.pkey_cache[port_num - start_port(device)];
+ if (device->cache.pkey_cache) {
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
- if (index < 0 || index >= cache->table_len)
- ret = -EINVAL;
- else
- *pkey = cache->table[index];
+ if (cache && index >= 0 && index < cache->table_len) {
+ *pkey = cache->table[index];
+ ret = 0;
+ }
+ }
read_unlock_irqrestore(&device->cache.lock, flags);
@@ -167,41 +173,93 @@ int ib_find_cached_pkey(struct ib_device *device,
unsigned long flags;
int i;
int ret = -ENOENT;
+ int partial_ix = -1;
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
+ *index = -1;
+
read_lock_irqsave(&device->cache.lock, flags);
+ if (!device->cache.pkey_cache)
+ goto out;
+
cache = device->cache.pkey_cache[port_num - start_port(device)];
+ if (!cache)
+ goto out;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] & 0x8000) {
+ *index = i;
+ ret = 0;
+ break;
+ } else
+ partial_ix = i;
+ }
+
+ if (ret && partial_ix >= 0) {
+ *index = partial_ix;
+ ret = 0;
+ }
+out:
+ read_unlock_irqrestore(&device->cache.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
*index = -1;
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ if (!device->cache.pkey_cache)
+ goto out;
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+ if (!cache)
+ goto out;
+
for (i = 0; i < cache->table_len; ++i)
- if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] == pkey) {
*index = i;
ret = 0;
break;
}
-
+out:
read_unlock_irqrestore(&device->cache.lock, flags);
-
return ret;
}
-EXPORT_SYMBOL(ib_find_cached_pkey);
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
int ib_get_cached_lmc(struct ib_device *device,
u8 port_num,
u8 *lmc)
{
unsigned long flags;
- int ret = 0;
+ int ret = -EINVAL;
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ if (device->cache.lmc_cache) {
+ *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ ret = 0;
+ }
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -217,6 +275,10 @@ static void ib_cache_update(struct ib_device *device,
int i;
int ret;
+ if (!(device->cache.pkey_cache && device->cache.gid_cache &&
+ device->cache.lmc_cache))
+ return;
+
tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
if (!tprops)
return;
@@ -309,7 +371,7 @@ static void ib_cache_event(struct ib_event_handler *handler,
INIT_WORK(&work->work, ib_cache_task);
work->device = event->device;
work->port_num = event->element.port_num;
- schedule_work(&work->work);
+ queue_work(ib_wq, &work->work);
}
}
}
@@ -362,14 +424,21 @@ err:
kfree(device->cache.pkey_cache);
kfree(device->cache.gid_cache);
kfree(device->cache.lmc_cache);
+ device->cache.pkey_cache = NULL;
+ device->cache.gid_cache = NULL;
+ device->cache.lmc_cache = NULL;
}
static void ib_cache_cleanup_one(struct ib_device *device)
{
int p;
+ if (!(device->cache.pkey_cache && device->cache.gid_cache &&
+ device->cache.lmc_cache))
+ return;
+
ib_unregister_event_handler(&device->cache.event_handler);
- flush_scheduled_work();
+ flush_workqueue(ib_wq);
for (p = 0; p <= end_port(device) - start_port(device); ++p) {
kfree(device->cache.pkey_cache[p]);
diff --git a/sys/ofed/drivers/infiniband/core/cm.c b/sys/ofed/drivers/infiniband/core/cm.c
index 3d2794d..07f6e08 100644
--- a/sys/ofed/drivers/infiniband/core/cm.c
+++ b/sys/ofed/drivers/infiniband/core/cm.c
@@ -36,16 +36,19 @@
#include <linux/completion.h>
#include <linux/dma-mapping.h>
#include <linux/device.h>
+#include <linux/module.h>
#include <linux/err.h>
#include <linux/idr.h>
#include <linux/interrupt.h>
#include <linux/random.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <linux/kdev_t.h>
#include <linux/string.h>
+#include <linux/etherdevice.h>
#include <asm/atomic-long.h>
@@ -57,16 +60,10 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#define PFX "ib_cm: "
-
-/*
- * Limit CM message timeouts to something reasonable:
- * 8 seconds per message, with up to 15 retries
- */
-static int max_timeout = 21;
-module_param(max_timeout, int, 0644);
-MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout "
- "(default=21, or ~8 seconds)");
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
static void cm_add_one(struct ib_device *device);
static void cm_remove_one(struct ib_device *device);
@@ -189,6 +186,8 @@ struct cm_av {
struct ib_ah_attr ah_attr;
u16 pkey_index;
u8 timeout;
+ u8 valid;
+ u8 smac[ETH_ALEN];
};
struct cm_work {
@@ -358,6 +357,23 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
grh, &av->ah_attr);
}
+int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(id, struct cm_id_private, id);
+
+ if (smac != NULL)
+ memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac));
+
+ if (alt_smac != NULL)
+ memcpy(cm_id_priv->alt_av.smac, alt_smac,
+ sizeof(cm_id_priv->alt_av.smac));
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_update_cm_av);
+
static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
{
struct cm_device *cm_dev;
@@ -388,6 +404,9 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
&av->ah_attr);
av->timeout = path->packet_life_time + 1;
+ memcpy(av->smac, path->smac, sizeof(av->smac));
+
+ av->valid = 1;
return 0;
}
@@ -402,7 +421,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
next_id, &id);
if (!ret)
- next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+ next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
spin_unlock_irqrestore(&cm.lock, flags);
} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
@@ -794,11 +813,11 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
}
}
-static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags)
{
struct cm_timewait_info *timewait_info;
- timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+ timewait_info = kzalloc(sizeof *timewait_info, flags);
if (!timewait_info)
return ERR_PTR(-ENOMEM);
@@ -902,6 +921,8 @@ retest:
break;
case IB_CM_ESTABLISHED:
spin_unlock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+ break;
ib_send_cm_dreq(cm_id, NULL, 0);
goto retest;
case IB_CM_DREQ_SENT:
@@ -1021,33 +1042,24 @@ static void cm_format_req(struct cm_req_msg *req_msg,
req_msg->service_id = param->service_id;
req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
- cm_req_set_resp_res(req_msg, param->responder_resources);
cm_req_set_init_depth(req_msg, param->initiator_depth);
cm_req_set_remote_resp_timeout(req_msg,
param->remote_cm_response_timeout);
- if (param->remote_cm_response_timeout > (u8) max_timeout) {
- printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > "
- "%d, decreasing\n", param->remote_cm_response_timeout,
- max_timeout);
- cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout);
- }
cm_req_set_qp_type(req_msg, param->qp_type);
cm_req_set_flow_ctrl(req_msg, param->flow_control);
cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
cm_req_set_local_resp_timeout(req_msg,
param->local_cm_response_timeout);
- if (param->local_cm_response_timeout > (u8) max_timeout) {
- printk(KERN_WARNING PFX "req local_cm_response_timeout %d > "
- "%d, decreasing\n", param->local_cm_response_timeout,
- max_timeout);
- cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout);
- }
- cm_req_set_retry_count(req_msg, param->retry_count);
req_msg->pkey = param->primary_path->pkey;
cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
- cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+
+ if (param->qp_type != IB_QPT_XRC_INI) {
+ cm_req_set_resp_res(req_msg, param->responder_resources);
+ cm_req_set_retry_count(req_msg, param->retry_count);
+ cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
cm_req_set_srq(req_msg, param->srq);
+ }
if (pri_path->hop_limit <= 1) {
req_msg->primary_local_lid = pri_path->slid;
@@ -1105,7 +1117,8 @@ static int cm_validate_req_param(struct ib_cm_req_param *param)
if (!param->primary_path)
return -EINVAL;
- if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC)
+ if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
+ param->qp_type != IB_QPT_XRC_INI)
return -EINVAL;
if (param->private_data &&
@@ -1137,38 +1150,34 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_IDLE) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
- id.local_id);
+ id.local_id,
+ GFP_ATOMIC);
if (IS_ERR(cm_id_priv->timewait_info)) {
- ret = PTR_ERR(cm_id_priv->timewait_info);
- goto out;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return (PTR_ERR(cm_id_priv->timewait_info));
}
ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
- if (ret)
- goto error1;
- if (param->alternate_path) {
+ if (!ret && param->alternate_path) {
ret = cm_init_av_by_path(param->alternate_path,
&cm_id_priv->alt_av);
- if (ret)
+ }
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
goto error1;
}
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
cm_id->service_id = param->service_id;
cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = cm_convert_to_ms(
param->primary_path->packet_life_time) * 2 +
cm_convert_to_ms(
param->remote_cm_response_timeout);
- if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
- printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n",
- cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout));
- cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
- }
cm_id_priv->max_cm_retries = param->max_cm_retries;
cm_id_priv->initiator_depth = param->initiator_depth;
cm_id_priv->responder_resources = param->responder_resources;
@@ -1201,9 +1210,11 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
-error2: cm_free_msg(cm_id_priv->msg);
-error1: kfree(cm_id_priv->timewait_info);
-out: return ret;
+error2:
+ cm_free_msg(cm_id_priv->msg);
+error1:
+ kfree(cm_id_priv->timewait_info);
+ return ret;
}
EXPORT_SYMBOL(ib_send_cm_req);
@@ -1556,7 +1567,8 @@ static int cm_req_handler(struct cm_work *work)
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
- id.local_id);
+ id.local_id,
+ GFP_KERNEL);
if (IS_ERR(cm_id_priv->timewait_info)) {
ret = PTR_ERR(cm_id_priv->timewait_info);
goto destroy;
@@ -1579,6 +1591,10 @@ static int cm_req_handler(struct cm_work *work)
cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+
+ /* Workarround: path in req_msg doesn't contain MAC, take it from wc */
+ memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6);
+ work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
if (ret) {
ib_get_cached_gid(work->port->cm_dev->ib_device,
@@ -1600,13 +1616,6 @@ static int cm_req_handler(struct cm_work *work)
cm_id_priv->tid = req_msg->hdr.tid;
cm_id_priv->timeout_ms = cm_convert_to_ms(
cm_req_get_local_resp_timeout(req_msg));
- if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) {
- printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, "
- "decreasing used timeout_ms\n",
- cm_req_get_local_resp_timeout(req_msg), max_timeout);
- cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
- }
-
cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
@@ -1638,18 +1647,24 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg,
cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
rep_msg->local_comm_id = cm_id_priv->id.local_id;
rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
- cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
rep_msg->resp_resources = param->responder_resources;
- rep_msg->initiator_depth = param->initiator_depth;
cm_rep_set_target_ack_delay(rep_msg,
cm_id_priv->av.port->cm_dev->ack_delay);
cm_rep_set_failover(rep_msg, param->failover_accepted);
- cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
- cm_rep_set_srq(rep_msg, param->srq);
rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+ if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
+ rep_msg->initiator_depth = param->initiator_depth;
+ cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+ cm_rep_set_srq(rep_msg, param->srq);
+ cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+ } else {
+ cm_rep_set_srq(rep_msg, 1);
+ cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num));
+ }
+
if (param->private_data && param->private_data_len)
memcpy(rep_msg->private_data, param->private_data,
param->private_data_len);
@@ -1672,6 +1687,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REQ_RCVD &&
cm_id->state != IB_CM_MRA_REQ_SENT) {
+ pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto out;
}
@@ -1697,7 +1713,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
cm_id_priv->initiator_depth = param->initiator_depth;
cm_id_priv->responder_resources = param->responder_resources;
cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
- cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg);
+ cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
@@ -1738,6 +1754,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REP_RCVD &&
cm_id->state != IB_CM_MRA_REP_SENT) {
+ pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto error;
}
@@ -1768,7 +1785,7 @@ error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
}
EXPORT_SYMBOL(ib_send_cm_rtu);
-static void cm_format_rep_event(struct cm_work *work)
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
{
struct cm_rep_msg *rep_msg;
struct ib_cm_rep_event_param *param;
@@ -1777,7 +1794,7 @@ static void cm_format_rep_event(struct cm_work *work)
param = &work->cm_event.param.rep_rcvd;
param->remote_ca_guid = rep_msg->local_ca_guid;
param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
- param->remote_qpn = be32_to_cpu(cm_rep_get_local_qpn(rep_msg));
+ param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
param->responder_resources = rep_msg->initiator_depth;
param->initiator_depth = rep_msg->resp_resources;
@@ -1842,10 +1859,11 @@ static int cm_rep_handler(struct cm_work *work)
cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
if (!cm_id_priv) {
cm_dup_rep_handler(work);
+ pr_debug("no cm_id_priv\n");
return -EINVAL;
}
- cm_format_rep_event(work);
+ cm_format_rep_event(work, cm_id_priv->qp_type);
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
@@ -1855,12 +1873,13 @@ static int cm_rep_handler(struct cm_work *work)
default:
spin_unlock_irq(&cm_id_priv->lock);
ret = -EINVAL;
+ pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state);
goto error;
}
cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
- cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
spin_lock(&cm.lock);
/* Check for duplicate REP. */
@@ -1868,6 +1887,7 @@ static int cm_rep_handler(struct cm_work *work)
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
ret = -EINVAL;
+ pr_debug("Failed to insert remote id\n");
goto error;
}
/* Check for a stale connection. */
@@ -1881,13 +1901,14 @@ static int cm_rep_handler(struct cm_work *work)
IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
NULL, 0);
ret = -EINVAL;
+ pr_debug("Stale connection.\n");
goto error;
}
spin_unlock(&cm.lock);
cm_id_priv->id.state = IB_CM_REP_RCVD;
cm_id_priv->id.remote_id = rep_msg->local_comm_id;
- cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+ cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
cm_id_priv->initiator_depth = rep_msg->resp_resources;
cm_id_priv->responder_resources = rep_msg->initiator_depth;
cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
@@ -2021,10 +2042,15 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id,
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_ESTABLISHED) {
+ pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto out;
}
+ if (cm_id->lap_state == IB_CM_LAP_SENT ||
+ cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
ret = cm_alloc_msg(cm_id_priv, &msg);
if (ret) {
cm_enter_timewait(cm_id_priv);
@@ -2086,6 +2112,7 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id,
if (cm_id->state != IB_CM_DREQ_RCVD) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
kfree(data);
+ pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state);
return -EINVAL;
}
@@ -2151,6 +2178,7 @@ static int cm_dreq_handler(struct cm_work *work)
atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
counter[CM_DREQ_COUNTER]);
cm_issue_drep(work->port, work->mad_recv_wc);
+ pr_debug("no cm_id_priv\n");
return -EINVAL;
}
@@ -2166,6 +2194,10 @@ static int cm_dreq_handler(struct cm_work *work)
ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
break;
case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
case IB_CM_MRA_REP_RCVD:
break;
case IB_CM_TIMEWAIT:
@@ -2187,6 +2219,7 @@ static int cm_dreq_handler(struct cm_work *work)
counter[CM_DREQ_COUNTER]);
goto unlock;
default:
+ pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state);
goto unlock;
}
cm_id_priv->id.state = IB_CM_DREQ_RCVD;
@@ -2290,6 +2323,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
cm_enter_timewait(cm_id_priv);
break;
default:
+ pr_debug("cm_id->state: 0x%x\n", cm_id->state);
ret = -EINVAL;
goto out;
}
@@ -2386,11 +2420,21 @@ static int cm_rej_handler(struct cm_work *work)
/* fall through */
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
+ cm_enter_timewait(cm_id_priv);
+ break;
case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+ cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg);
cm_enter_timewait(cm_id_priv);
break;
+ }
+ /* fall through */
default:
spin_unlock_irq(&cm_id_priv->lock);
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
goto out;
}
@@ -2453,6 +2497,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
break;
}
default:
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
goto error1;
}
@@ -2518,12 +2563,6 @@ static int cm_mra_handler(struct cm_work *work)
cm_mra_get_service_timeout(mra_msg);
timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
cm_convert_to_ms(cm_id_priv->av.timeout);
- if (timeout > cm_convert_to_ms(max_timeout)) {
- printk(KERN_WARNING PFX "calculated mra timeout %d > %d, "
- "decreasing used timeout_ms\n", timeout,
- cm_convert_to_ms(max_timeout));
- timeout = cm_convert_to_ms(max_timeout);
- }
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
@@ -2560,6 +2599,7 @@ static int cm_mra_handler(struct cm_work *work)
counter[CM_MRA_COUNTER]);
/* fall through */
default:
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
goto out;
}
@@ -2746,7 +2786,8 @@ static int cm_lap_handler(struct cm_work *work)
cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
- cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+ if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av))
+ goto unlock;
ret = atomic_inc_and_test(&cm_id_priv->work_count);
if (!ret)
list_add_tail(&work->list, &cm_id_priv->work_list);
@@ -2938,6 +2979,9 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
return -EINVAL;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
if (ret)
goto out;
@@ -2945,12 +2989,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
cm_id->service_id = param->service_id;
cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = param->timeout_ms;
- if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
- printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, "
- "decreasing used timeout_ms\n", param->timeout_ms,
- cm_convert_to_ms(max_timeout));
- cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
- }
cm_id_priv->max_cm_retries = param->max_cm_retries;
ret = cm_alloc_msg(cm_id_priv, &msg);
if (ret)
@@ -2961,21 +2999,19 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
- spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state == IB_CM_IDLE)
ret = ib_post_send_mad(msg, NULL);
else
ret = -EINVAL;
if (ret) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_free_msg(msg);
goto out;
}
cm_id->state = IB_CM_SIDR_REQ_SENT;
cm_id_priv->msg = msg;
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_sidr_req);
@@ -3038,6 +3074,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
goto out; /* No match. */
}
atomic_inc(&cur_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
spin_unlock_irq(&cm.lock);
cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
@@ -3302,6 +3339,7 @@ static void cm_work_handler(struct work_struct *_work)
ret = cm_timewait_handler(work);
break;
default:
+ pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event);
ret = -EINVAL;
break;
}
@@ -3332,6 +3370,7 @@ static int cm_establish(struct ib_cm_id *cm_id)
ret = -EISCONN;
break;
default:
+ pr_debug("cm_id->state: 0x%x\n", cm_id->state);
ret = -EINVAL;
break;
}
@@ -3494,6 +3533,7 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3520,10 +3560,36 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
IB_QP_DEST_QPN | IB_QP_RQ_PSN;
qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ if (!cm_id_priv->av.valid)
+ return -EINVAL;
+ if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
+ qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+ *qp_attr_mask |= IB_QP_VID;
+ }
+ if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
+ memcpy(qp_attr->smac, cm_id_priv->av.smac,
+ sizeof(qp_attr->smac));
+ *qp_attr_mask |= IB_QP_SMAC;
+ }
+ if (cm_id_priv->alt_av.valid) {
+ if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
+ qp_attr->alt_vlan_id =
+ cm_id_priv->alt_av.ah_attr.vlan_id;
+ *qp_attr_mask |= IB_QP_ALT_VID;
+ }
+ if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
+ memcpy(qp_attr->alt_smac,
+ cm_id_priv->alt_av.smac,
+ sizeof(qp_attr->alt_smac));
+ *qp_attr_mask |= IB_QP_ALT_SMAC;
+ }
+ }
+
qp_attr->path_mtu = cm_id_priv->path_mtu;
qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
- if (cm_id_priv->qp_type == IB_QPT_RC) {
+ if (cm_id_priv->qp_type == IB_QPT_RC ||
+ cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
*qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_MIN_RNR_TIMER;
qp_attr->max_dest_rd_atomic =
@@ -3540,6 +3606,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3568,15 +3635,21 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
*qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
- if (cm_id_priv->qp_type == IB_QPT_RC) {
- *qp_attr_mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
+ switch (cm_id_priv->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_XRC_INI:
+ *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
IB_QP_MAX_QP_RD_ATOMIC;
- qp_attr->timeout = cm_id_priv->av.timeout;
qp_attr->retry_cnt = cm_id_priv->retry_count;
qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
- qp_attr->max_rd_atomic =
- cm_id_priv->initiator_depth;
+ qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+ /* fall through */
+ case IB_QPT_XRC_TGT:
+ *qp_attr_mask |= IB_QP_TIMEOUT;
+ qp_attr->timeout = cm_id_priv->av.timeout;
+ break;
+ default:
+ break;
}
if (cm_id_priv->alt_av.ah_attr.dlid) {
*qp_attr_mask |= IB_QP_PATH_MIG_STATE;
@@ -3593,6 +3666,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
+ pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3619,6 +3693,7 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
break;
default:
+ pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state);
ret = -EINVAL;
break;
}
@@ -3649,7 +3724,7 @@ static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
atomic_long_read(&group->counter[cm_attr->index]));
}
-static struct sysfs_ops cm_counter_ops = {
+static const struct sysfs_ops cm_counter_ops = {
.show = cm_show_counter
};
@@ -3670,8 +3745,17 @@ static struct kobj_type cm_port_obj_type = {
.release = cm_release_port_obj
};
+static char *cm_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
struct class cm_class = {
+ .owner = THIS_MODULE,
.name = "infiniband_cm",
+ .devnode = cm_devnode,
};
EXPORT_SYMBOL(cm_class);
@@ -3745,7 +3829,7 @@ static void cm_add_one(struct ib_device *ib_device)
cm_dev->device = device_create(&cm_class, &ib_device->dev,
MKDEV(0, 0), NULL,
"%s", ib_device->name);
- if (!cm_dev->device) {
+ if (IS_ERR(cm_dev->device)) {
kfree(cm_dev);
return;
}
@@ -3846,28 +3930,33 @@ static int __init ib_cm_init(void)
cm.remote_sidr_table = RB_ROOT;
idr_init(&cm.local_id_table);
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
- idr_pre_get(&cm.local_id_table, GFP_KERNEL);
+ if (!idr_pre_get(&cm.local_id_table, GFP_KERNEL))
+ return -ENOMEM;
INIT_LIST_HEAD(&cm.timewait_list);
ret = class_register(&cm_class);
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ ret = -ENOMEM;
+ goto error1;
+ }
cm.wq = create_workqueue("ib_cm");
if (!cm.wq) {
ret = -ENOMEM;
- goto error1;
+ goto error2;
}
ret = ib_register_client(&cm_client);
if (ret)
- goto error2;
+ goto error3;
return 0;
-error2:
+error3:
destroy_workqueue(cm.wq);
-error1:
+error2:
class_unregister(&cm_class);
+error1:
+ idr_destroy(&cm.local_id_table);
return ret;
}
diff --git a/sys/ofed/drivers/infiniband/core/cm_msgs.h b/sys/ofed/drivers/infiniband/core/cm_msgs.h
index 7e63c08..be068f4 100644
--- a/sys/ofed/drivers/infiniband/core/cm_msgs.h
+++ b/sys/ofed/drivers/infiniband/core/cm_msgs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
* Copyright (c) 2004 Topspin Corporation. All rights reserved.
* Copyright (c) 2004 Voltaire Corporation. All rights reserved.
*
@@ -44,18 +44,6 @@
#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
-#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
-#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
-#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
-#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
-#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
-#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015)
-#define CM_DREP_ATTR_ID cpu_to_be16(0x0016)
-#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017)
-#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018)
-#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
-#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
-
enum cm_msg_sequence {
CM_MSG_SEQUENCE_REQ,
CM_MSG_SEQUENCE_LAP,
@@ -86,7 +74,7 @@ struct cm_req_msg {
__be16 pkey;
/* path MTU:4, RDC exists:1, RNR retry count:3. */
u8 offset50;
- /* max CM Retries:4, SRQ:1, rsvd:3 */
+ /* max CM Retries:4, SRQ:1, extended transport type:3 */
u8 offset51;
__be16 primary_local_lid;
@@ -175,6 +163,11 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
switch(transport_type) {
case 0: return IB_QPT_RC;
case 1: return IB_QPT_UC;
+ case 3:
+ switch (req_msg->offset51 & 0x7) {
+ case 1: return IB_QPT_XRC_TGT;
+ default: return 0;
+ }
default: return 0;
}
}
@@ -188,6 +181,12 @@ static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
req_msg->offset40) &
0xFFFFFFF9) | 0x2);
break;
+ case IB_QPT_XRC_INI:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x6);
+ req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1;
+ break;
default:
req_msg->offset40 = cpu_to_be32(be32_to_cpu(
req_msg->offset40) &
@@ -527,6 +526,23 @@ static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
(be32_to_cpu(rep_msg->offset12) & 0x000000FF));
}
+static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8);
+}
+
+static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn)
+{
+ rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) |
+ (be32_to_cpu(rep_msg->offset16) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+{
+ return (qp_type == IB_QPT_XRC_INI) ?
+ cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg);
+}
+
static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
{
return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
@@ -771,6 +787,7 @@ struct cm_apr_msg {
u8 info_length;
u8 ap_status;
+ __be16 rsvd;
u8 info[IB_CM_APR_INFO_LENGTH];
u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
index 318beb1..d2064b6 100644
--- a/sys/ofed/drivers/infiniband/core/cma.c
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -40,6 +40,10 @@
#include <linux/random.h>
#include <linux/idr.h>
#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <net/route.h>
#include <net/tcp.h>
#include <net/ipv6.h>
@@ -55,28 +59,47 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("Generic RDMA CM Agent");
MODULE_LICENSE("Dual BSD/GPL");
-static int tavor_quirk = 0;
-module_param_named(tavor_quirk, tavor_quirk, int, 0644);
-MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
-
-int unify_tcp_port_space = 1;
-module_param(unify_tcp_port_space, int, 0644);
-MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
- "space allocation (default=1)");
-
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
-#define IBOE_PACKET_LIFETIME 18
+#define CMA_IBOE_PACKET_LIFETIME 18
static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
module_param_named(cma_response_timeout, cma_response_timeout, int, 0644);
-MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20");
+MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)");
static int def_prec2sl = 3;
module_param_named(def_prec2sl, def_prec2sl, int, 0644);
MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+static int debug_level = 0;
+#define cma_pr(level, priv, format, arg...) \
+ printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg)
+
+#define cma_dbg(priv, format, arg...) \
+ do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0)
+
+#define cma_warn(priv, format, arg...) \
+ cma_pr(KERN_WARNING, priv, format, ## arg)
+
+#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x"
+#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\
+ ((u8 *)(gid))[13],\
+ ((u8 *)(gid))[14],\
+ ((u8 *)(gid))[15]
+
+#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw)
+#define cma_debug_path(priv, pfx, p) \
+ cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \
+ CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \
+ CMA_GID_ARG(p.dgid))
+
+#define cma_debug_gid(priv, g) \
+ cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g)
+
+module_param_named(debug_level, debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "debug level default=0");
+
static void cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device);
@@ -92,13 +115,12 @@ static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
static DEFINE_MUTEX(lock);
static struct workqueue_struct *cma_wq;
+static struct workqueue_struct *cma_free_wq;
static DEFINE_IDR(sdp_ps);
static DEFINE_IDR(tcp_ps);
static DEFINE_IDR(udp_ps);
static DEFINE_IDR(ipoib_ps);
-#if defined(INET)
-static int next_port;
-#endif
+static DEFINE_IDR(ib_ps);
struct cma_device {
struct list_head list;
@@ -108,26 +130,16 @@ struct cma_device {
struct list_head id_list;
};
-enum cma_state {
- CMA_IDLE,
- CMA_ADDR_QUERY,
- CMA_ADDR_RESOLVED,
- CMA_ROUTE_QUERY,
- CMA_ROUTE_RESOLVED,
- CMA_CONNECT,
- CMA_DISCONNECT,
- CMA_ADDR_BOUND,
- CMA_LISTEN,
- CMA_DEVICE_REMOVAL,
- CMA_DESTROYING
-};
-
struct rdma_bind_list {
struct idr *ps;
struct hlist_head owners;
unsigned short port;
};
+enum {
+ CMA_OPTION_AFONLY,
+};
+
/*
* Device removal can occur at anytime, so we need extra handling to
* serialize notifying the user of device removal with other callbacks.
@@ -138,7 +150,7 @@ struct rdma_id_private {
struct rdma_cm_id id;
struct rdma_bind_list *bind_list;
- struct socket *sock;
+ struct socket *sock;
struct hlist_node node;
struct list_head list; /* listen_any_list or cma_device.list */
struct list_head listen_list; /* per device listens */
@@ -146,13 +158,15 @@ struct rdma_id_private {
struct list_head mc_list;
int internal_id;
- enum cma_state state;
+ enum rdma_cm_state state;
spinlock_t lock;
+ spinlock_t cm_lock;
struct mutex qp_mutex;
struct completion comp;
atomic_t refcount;
struct mutex handler_mutex;
+ struct work_struct work; /* garbage coll */
int backlog;
int timeout_ms;
@@ -166,8 +180,16 @@ struct rdma_id_private {
u32 seq_num;
u32 qkey;
u32 qp_num;
+ pid_t owner;
+ u32 options;
u8 srq;
u8 tos;
+ u8 reuseaddr;
+ u8 afonly;
+ int qp_timeout;
+ /* cache for mc record params */
+ struct ib_sa_mcmember_rec rec;
+ int is_valid_rec;
};
struct cma_multicast {
@@ -184,8 +206,8 @@ struct cma_multicast {
struct cma_work {
struct work_struct work;
struct rdma_id_private *id;
- enum cma_state old_state;
- enum cma_state new_state;
+ enum rdma_cm_state old_state;
+ enum rdma_cm_state new_state;
struct rdma_cm_event event;
};
@@ -236,7 +258,7 @@ struct sdp_hah {
#define CMA_VERSION 0x00
#define SDP_MAJ_VERSION 0x2
-static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
{
unsigned long flags;
int ret;
@@ -248,7 +270,7 @@ static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
}
static int cma_comp_exch(struct rdma_id_private *id_priv,
- enum cma_state comp, enum cma_state exch)
+ enum rdma_cm_state comp, enum rdma_cm_state exch)
{
unsigned long flags;
int ret;
@@ -260,11 +282,11 @@ static int cma_comp_exch(struct rdma_id_private *id_priv,
return ret;
}
-static enum cma_state cma_exch(struct rdma_id_private *id_priv,
- enum cma_state exch)
+static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state exch)
{
unsigned long flags;
- enum cma_state old;
+ enum rdma_cm_state old;
spin_lock_irqsave(&id_priv->lock, flags);
old = id_priv->state;
@@ -298,11 +320,6 @@ static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
}
-static inline int cma_is_ud_ps(enum rdma_port_space ps)
-{
- return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
-}
-
static void cma_attach_to_dev(struct rdma_id_private *id_priv,
struct cma_device *cma_dev)
{
@@ -328,11 +345,13 @@ static inline void release_mc(struct kref *kref)
kfree(mc);
}
-static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+static void cma_release_dev(struct rdma_id_private *id_priv)
{
+ mutex_lock(&lock);
list_del(&id_priv->list);
cma_deref_dev(id_priv->cma_dev);
id_priv->cma_dev = NULL;
+ mutex_unlock(&lock);
}
static int cma_set_qkey(struct rdma_id_private *id_priv)
@@ -361,36 +380,71 @@ static int cma_set_qkey(struct rdma_id_private *id_priv)
return ret;
}
+static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num)
+{
+ int i;
+ int err;
+ struct ib_port_attr props;
+ union ib_gid tmp;
+
+ err = ib_query_port(device, port_num, &props);
+ if (err)
+ return 1;
+
+ for (i = 0; i < props.gid_tbl_len; ++i) {
+ err = ib_query_gid(device, port_num, i, &tmp);
+ if (err)
+ return 1;
+ if (!memcmp(&tmp, gid, sizeof tmp))
+ return 0;
+ }
+
+ return -EAGAIN;
+}
+
static int cma_acquire_dev(struct rdma_id_private *id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct cma_device *cma_dev;
- union ib_gid gid;
+ union ib_gid gid, iboe_gid;
int ret = -ENODEV;
+ u8 port;
+ enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
- if (dev_addr->dev_type != ARPHRD_INFINIBAND) {
- iboe_addr_get_sgid(dev_addr, &gid);
- list_for_each_entry(cma_dev, &dev_list, list) {
- ret = ib_find_cached_gid(cma_dev->device, &gid,
- &id_priv->id.port_num, NULL);
- if (!ret)
- goto out;
- }
- }
+ if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ mutex_lock(&lock);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &iboe_gid);
memcpy(&gid, dev_addr->src_dev_addr +
rdma_addr_gid_offset(dev_addr), sizeof gid);
list_for_each_entry(cma_dev, &dev_list, list) {
- ret = ib_find_cached_gid(cma_dev->device, &gid,
- &id_priv->id.port_num, NULL);
- if (!ret)
+ for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+ if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
+ if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+ ret = find_gid_port(cma_dev->device, &iboe_gid, port);
+ else
+ ret = find_gid_port(cma_dev->device, &gid, port);
+
+ if (!ret) {
+ id_priv->id.port_num = port;
+ goto out;
+ } else if (ret == 1)
break;
}
+ }
+ }
out:
if (!ret)
cma_attach_to_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
return ret;
}
@@ -401,7 +455,7 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
}
static int cma_disable_callback(struct rdma_id_private *id_priv,
- enum cma_state state)
+ enum rdma_cm_state state)
{
mutex_lock(&id_priv->handler_mutex);
if (id_priv->state != state) {
@@ -411,13 +465,9 @@ static int cma_disable_callback(struct rdma_id_private *id_priv,
return 0;
}
-static int cma_has_cm_dev(struct rdma_id_private *id_priv)
-{
- return (id_priv->id.device && id_priv->cm_id.ib);
-}
-
struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
- void *context, enum rdma_port_space ps)
+ void *context, enum rdma_port_space ps,
+ enum ib_qp_type qp_type)
{
struct rdma_id_private *id_priv;
@@ -425,11 +475,14 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
if (!id_priv)
return ERR_PTR(-ENOMEM);
- id_priv->state = CMA_IDLE;
+ id_priv->owner = curthread->td_proc->p_pid;
+ id_priv->state = RDMA_CM_IDLE;
id_priv->id.context = context;
id_priv->id.event_handler = event_handler;
id_priv->id.ps = ps;
+ id_priv->id.qp_type = qp_type;
spin_lock_init(&id_priv->lock);
+ spin_lock_init(&id_priv->cm_lock);
mutex_init(&id_priv->qp_mutex);
init_completion(&id_priv->comp);
atomic_set(&id_priv->refcount, 1);
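
With the QP type now stored on the ID itself, the old cma_is_ud_ps() port-space heuristic goes away and every caller of rdma_create_id() must pass the type explicitly. A hedged sketch of what an in-kernel consumer's call site might look like under the new four-argument signature; demo_event_handler and demo_open_id are illustrative names, not part of this patch:

#include <rdma/rdma_cm.h>

/* Illustrative consumer: create an RC connection ID in the TCP port
 * space; a UD consumer would pass IB_QPT_UD (typically with
 * RDMA_PS_UDP or RDMA_PS_IPOIB) instead. */
static int demo_event_handler(struct rdma_cm_id *id,
                              struct rdma_cm_event *event)
{
        /* A real handler would dispatch on event->event here. */
        return 0;
}

static struct rdma_cm_id *demo_open_id(void *context)
{
        return rdma_create_id(demo_event_handler, context,
                              RDMA_PS_TCP, IB_QPT_RC);
}
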
@@ -496,7 +549,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
if (IS_ERR(qp))
return PTR_ERR(qp);
- if (cma_is_ud_ps(id_priv->id.ps))
+ if (id->qp_type == IB_QPT_UD)
ret = cma_init_ud_qp(id_priv, qp);
else
ret = cma_init_conn_qp(id_priv, qp);
@@ -530,6 +583,7 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
+ union ib_gid sgid;
mutex_lock(&id_priv->qp_mutex);
if (!id_priv->id.qp) {
@@ -551,6 +605,20 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
goto out;
+ ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
+ qp_attr.ah_attr.grh.sgid_index, &sgid);
+ if (ret)
+ goto out;
+
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
+ == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
+ == IB_LINK_LAYER_ETHERNET) {
+ ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
+
+ if (ret)
+ goto out;
+ }
if (conn_param)
qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
@@ -579,6 +647,12 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
if (conn_param)
qp_attr.max_rd_atomic = conn_param->initiator_depth;
+
+ if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) {
+ qp_attr.timeout = id_priv->qp_timeout;
+ qp_attr_mask |= IB_QP_TIMEOUT;
+ }
+
ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
out:
mutex_unlock(&id_priv->qp_mutex);
@@ -624,7 +698,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
qp_attr->port_num = id_priv->id.port_num;
*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
- if (cma_is_ud_ps(id_priv->id.ps)) {
+ if (id_priv->id.qp_type == IB_QPT_UD) {
ret = cma_set_qkey(id_priv);
if (ret)
return ret;
@@ -647,7 +721,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
id_priv = container_of(id, struct rdma_id_private, id);
switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
+ if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
else
ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
@@ -656,6 +730,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
qp_attr->rq_psn = id_priv->seq_num;
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
if (!id_priv->cm_id.iw) {
qp_attr->qp_access_flags = 0;
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
@@ -701,6 +776,21 @@ static inline int cma_any_addr(struct sockaddr *addr)
return cma_zero_addr(addr) || cma_loopback_addr(addr);
}
+static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+{
+ if (src->sa_family != dst->sa_family)
+ return -1;
+
+ switch (src->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
+ ((struct sockaddr_in *) dst)->sin_addr.s_addr;
+ default:
+ return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
+ &((struct sockaddr_in6 *) dst)->sin6_addr);
+ }
+}
+
static inline __be16 cma_port(struct sockaddr *addr)
{
if (addr->sa_family == AF_INET)
@@ -831,16 +921,16 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv)
}
static void cma_cancel_operation(struct rdma_id_private *id_priv,
- enum cma_state state)
+ enum rdma_cm_state state)
{
switch (state) {
- case CMA_ADDR_QUERY:
+ case RDMA_CM_ADDR_QUERY:
rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
break;
- case CMA_ROUTE_QUERY:
+ case RDMA_CM_ROUTE_QUERY:
cma_cancel_route(id_priv);
break;
- case CMA_LISTEN:
+ case RDMA_CM_LISTEN:
if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
&& !id_priv->cma_dev)
cma_cancel_listens(id_priv);
@@ -852,20 +942,21 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
static void cma_release_port(struct rdma_id_private *id_priv)
{
- struct rdma_bind_list *bind_list = id_priv->bind_list;
-
- if (!bind_list)
- return;
+ struct rdma_bind_list *bind_list;
mutex_lock(&lock);
+ bind_list = id_priv->bind_list;
+ if (!bind_list) {
+ mutex_unlock(&lock);
+ return;
+ }
hlist_del(&id_priv->node);
+ id_priv->bind_list = NULL;
if (hlist_empty(&bind_list->owners)) {
idr_remove(bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
- if (id_priv->sock)
- sock_release(id_priv->sock);
}
static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
@@ -889,46 +980,66 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
}
}
}
+static void __rdma_free(struct work_struct *work)
+{
+ struct rdma_id_private *id_priv;
+ id_priv = container_of(work, struct rdma_id_private, work);
+
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv);
+}
void rdma_destroy_id(struct rdma_cm_id *id)
{
struct rdma_id_private *id_priv;
- enum cma_state state;
+ enum rdma_cm_state state;
+ unsigned long flags;
+ struct ib_cm_id *ib;
id_priv = container_of(id, struct rdma_id_private, id);
- state = cma_exch(id_priv, CMA_DESTROYING);
+ state = cma_exch(id_priv, RDMA_CM_DESTROYING);
cma_cancel_operation(id_priv, state);
- mutex_lock(&lock);
+ /*
+ * Wait for any active callback to finish. New callbacks will find
+ * the id_priv state set to destroying and abort.
+ */
+ mutex_lock(&id_priv->handler_mutex);
+ mutex_unlock(&id_priv->handler_mutex);
+
if (id_priv->cma_dev) {
- mutex_unlock(&lock);
switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
- ib_destroy_cm_id(id_priv->cm_id.ib);
+ spin_lock_irqsave(&id_priv->cm_lock, flags);
+ if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) {
+ ib = id_priv->cm_id.ib;
+ id_priv->cm_id.ib = NULL;
+ spin_unlock_irqrestore(&id_priv->cm_lock, flags);
+ ib_destroy_cm_id(ib);
+ } else
+ spin_unlock_irqrestore(&id_priv->cm_lock, flags);
break;
case RDMA_TRANSPORT_IWARP:
- if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+ case RDMA_TRANSPORT_SCIF:
+ if (id_priv->cm_id.iw)
iw_destroy_cm_id(id_priv->cm_id.iw);
break;
default:
break;
}
cma_leave_mc_groups(id_priv);
- mutex_lock(&lock);
- cma_detach_from_dev(id_priv);
+ cma_release_dev(id_priv);
}
- mutex_unlock(&lock);
cma_release_port(id_priv);
cma_deref_id(id_priv);
- wait_for_completion(&id_priv->comp);
-
- if (id_priv->internal_id)
- cma_deref_id(id_priv->id.context);
-
- kfree(id_priv->id.route.path_rec);
- kfree(id_priv);
+ INIT_WORK(&id_priv->work, __rdma_free);
+ queue_work(cma_free_wq, &id_priv->work);
}
EXPORT_SYMBOL(rdma_destroy_id);
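
rdma_destroy_id() no longer waits for the last reference in the caller's context; __rdma_free(), queued on the new cma_free_wq workqueue, performs the wait_for_completion() and the final kfree() instead. Below is a minimal userspace model of that deferred-free pattern, assuming pthread primitives and demo_* names in place of the kernel's completion, work_struct, and workqueue:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct completion: a flag guarded by a mutex/condvar. */
struct demo_completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
};

struct demo_id {
        struct demo_completion comp;
        /* ...payload that must not be freed while references exist... */
};

static void demo_complete(struct demo_completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

/* Plays the role of __rdma_free() running on cma_free_wq: wait until
 * the last reference is gone, then release the memory. */
static void *demo_free_work(void *arg)
{
        struct demo_id *id = arg;

        pthread_mutex_lock(&id->comp.lock);
        while (!id->comp.done)
                pthread_cond_wait(&id->comp.cond, &id->comp.lock);
        pthread_mutex_unlock(&id->comp.lock);
        free(id);
        printf("freed from worker context\n");
        return NULL;
}

int main(void)
{
        struct demo_id *id = calloc(1, sizeof(*id));
        pthread_t worker;

        pthread_mutex_init(&id->comp.lock, NULL);
        pthread_cond_init(&id->comp.cond, NULL);
        pthread_create(&worker, NULL, demo_free_work, id);

        /* The destroy path: signalling stands in for cma_deref_id()
         * dropping the final reference. */
        demo_complete(&id->comp);
        pthread_join(worker, NULL);
        return 0;
}
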
@@ -944,6 +1055,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
if (ret)
goto reject;
+ cma_dbg(id_priv, "sending RTU\n");
ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
if (ret)
goto reject;
@@ -951,6 +1063,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
return 0;
reject:
cma_modify_qp_err(id_priv);
+ cma_dbg(id_priv, "sending REJ\n");
ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
NULL, 0, NULL, 0);
return ret;
@@ -987,11 +1100,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
int ret = 0;
if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, CMA_CONNECT)) ||
+ cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
(ib_event->event == IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, CMA_DISCONNECT)))
+ cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
return 0;
-
memset(&event, 0, sizeof event);
switch (ib_event->event) {
case IB_CM_REQ_ERROR:
@@ -1020,7 +1132,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
event.status = -ETIMEDOUT; /* fall through */
case IB_CM_DREQ_RECEIVED:
case IB_CM_DREP_RECEIVED:
- if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+ if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT))
goto out;
event.event = RDMA_CM_EVENT_DISCONNECTED;
break;
@@ -1047,7 +1160,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
@@ -1070,12 +1183,12 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (cma_get_net_info(ib_event->private_data, listen_id->ps,
&ip_ver, &port, &src, &dst))
- goto err;
+ return NULL;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
- listen_id->ps);
+ listen_id->ps, ib_event->param.req_rcvd.qp_type);
if (IS_ERR(id))
- goto err;
+ return NULL;
cma_save_net_info(&id->route.addr, &listen_id->route.addr,
ip_ver, port, src, dst);
@@ -1085,7 +1198,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
GFP_KERNEL);
if (!rt->path_rec)
- goto destroy_id;
+ goto err;
rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
if (rt->num_paths == 2)
@@ -1094,22 +1207,21 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) {
rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
} else {
ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr,
- &rt->addr.dev_addr);
+ &rt->addr.dev_addr, NULL);
if (ret)
- goto destroy_id;
+ goto err;
}
rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
id_priv = container_of(id, struct rdma_id_private, id);
- id_priv->state = CMA_CONNECT;
+ id_priv->state = RDMA_CM_CONNECT;
return id_priv;
-destroy_id:
- rdma_destroy_id(id);
err:
+ rdma_destroy_id(id);
return NULL;
}
@@ -1124,7 +1236,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
- listen_id->ps);
+ listen_id->ps, IB_QPT_UD);
if (IS_ERR(id))
return NULL;
@@ -1138,13 +1250,13 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
- &id->route.addr.dev_addr);
+ &id->route.addr.dev_addr, NULL);
if (ret)
goto err;
}
id_priv = container_of(id, struct rdma_id_private, id);
- id_priv->state = CMA_CONNECT;
+ id_priv->state = RDMA_CM_CONNECT;
return id_priv;
err:
rdma_destroy_id(id);
@@ -1166,20 +1278,43 @@ static void cma_set_req_event_data(struct rdma_cm_event *event,
event->param.conn.qp_num = req_data->remote_qpn;
}
+static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event)
+{
+ return (((ib_event->event == IB_CM_REQ_RECEIVED) &&
+ (ib_event->param.req_rcvd.qp_type == id->qp_type)) ||
+ ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) &&
+ (id->qp_type == IB_QPT_UD)) ||
+ (!id->qp_type));
+}
+
static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
int offset, ret;
+ u8 smac[ETH_ALEN];
+ u8 alt_smac[ETH_ALEN];
+ u8 *psmac = smac;
+ u8 *palt_smac = alt_smac;
+ int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) ==
+ RDMA_TRANSPORT_IB) &&
+ (rdma_port_get_link_layer(cm_id->device,
+ ib_event->param.req_rcvd.port) ==
+ IB_LINK_LAYER_ETHERNET));
+ int is_sidr = 0;
listen_id = cm_id->context;
- if (cma_disable_callback(listen_id, CMA_LISTEN))
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event))
+ return -EINVAL;
+
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
return -ECONNABORTED;
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id->id.ps);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
- if (cma_is_ud_ps(listen_id->id.ps)) {
+ if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+ is_sidr = 1;
conn_id = cma_new_udp_id(&listen_id->id, ib_event);
event.param.ud.private_data = ib_event->private_data + offset;
event.param.ud.private_data_len =
@@ -1191,45 +1326,69 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
}
if (!conn_id) {
ret = -ENOMEM;
- goto out;
+ goto err1;
}
mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
- mutex_lock(&lock);
ret = cma_acquire_dev(conn_id);
- mutex_unlock(&lock);
if (ret)
- goto release_conn_id;
+ goto err2;
conn_id->cm_id.ib = cm_id;
cm_id->context = conn_id;
cm_id->cm_handler = cma_ib_handler;
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
ret = conn_id->id.event_handler(&conn_id->id, &event);
- if (!ret) {
+ if (ret)
+ goto err3;
+
+ if (is_iboe && !is_sidr) {
+ if (ib_event->param.req_rcvd.primary_path != NULL)
+ rdma_addr_find_smac_by_sgid(
+ &ib_event->param.req_rcvd.primary_path->sgid,
+ psmac, NULL);
+ else
+ psmac = NULL;
+ if (ib_event->param.req_rcvd.alternate_path != NULL)
+ rdma_addr_find_smac_by_sgid(
+ &ib_event->param.req_rcvd.alternate_path->sgid,
+ palt_smac, NULL);
+ else
+ palt_smac = NULL;
+ }
/*
* Acquire mutex to prevent user executing rdma_destroy_id()
* while we're accessing the cm_id.
*/
mutex_lock(&lock);
- if (cma_comp(conn_id, CMA_CONNECT) &&
- !cma_is_ud_ps(conn_id->id.ps))
+ if (is_iboe && !is_sidr)
+ ib_update_cm_av(cm_id, psmac, palt_smac);
+ if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) {
+ cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n");
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ }
mutex_unlock(&lock);
mutex_unlock(&conn_id->handler_mutex);
- goto out;
- }
+ mutex_unlock(&listen_id->handler_mutex);
+ cma_deref_id(conn_id);
+ return 0;
+err3:
+ cma_deref_id(conn_id);
/* Destroy the CM ID by returning a non-zero value. */
conn_id->cm_id.ib = NULL;
-
-release_conn_id:
- cma_exch(conn_id, CMA_DESTROYING);
+err2:
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
mutex_unlock(&conn_id->handler_mutex);
- rdma_destroy_id(&conn_id->id);
-
-out:
+err1:
mutex_unlock(&listen_id->handler_mutex);
+ if (conn_id)
+ rdma_destroy_id(&conn_id->id);
return ret;
}
@@ -1244,9 +1403,7 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
struct cma_hdr *cma_data, *cma_mask;
struct sdp_hh *sdp_data, *sdp_mask;
__be32 ip4_addr;
-#ifdef INET6
struct in6_addr ip6_addr;
-#endif
memset(compare, 0, sizeof *compare);
cma_data = (void *) compare->data;
@@ -1260,33 +1417,39 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
if (ps == RDMA_PS_SDP) {
sdp_set_ip_ver(sdp_data, 4);
sdp_set_ip_ver(sdp_mask, 0xF);
- sdp_data->dst_addr.ip4.addr = ip4_addr;
- sdp_mask->dst_addr.ip4.addr = htonl(~0);
+ if (!cma_any_addr(addr)) {
+ sdp_data->dst_addr.ip4.addr = ip4_addr;
+ sdp_mask->dst_addr.ip4.addr = htonl(~0);
+ }
} else {
cma_set_ip_ver(cma_data, 4);
cma_set_ip_ver(cma_mask, 0xF);
- cma_data->dst_addr.ip4.addr = ip4_addr;
- cma_mask->dst_addr.ip4.addr = htonl(~0);
+ if (!cma_any_addr(addr)) {
+ cma_data->dst_addr.ip4.addr = ip4_addr;
+ cma_mask->dst_addr.ip4.addr = htonl(~0);
+ }
}
break;
-#ifdef INET6
case AF_INET6:
ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
if (ps == RDMA_PS_SDP) {
sdp_set_ip_ver(sdp_data, 6);
sdp_set_ip_ver(sdp_mask, 0xF);
- sdp_data->dst_addr.ip6 = ip6_addr;
- memset(&sdp_mask->dst_addr.ip6, 0xFF,
- sizeof sdp_mask->dst_addr.ip6);
+ if (!cma_any_addr(addr)) {
+ sdp_data->dst_addr.ip6 = ip6_addr;
+ memset(&sdp_mask->dst_addr.ip6, 0xFF,
+ sizeof(sdp_mask->dst_addr.ip6));
+ }
} else {
cma_set_ip_ver(cma_data, 6);
cma_set_ip_ver(cma_mask, 0xF);
- cma_data->dst_addr.ip6 = ip6_addr;
- memset(&cma_mask->dst_addr.ip6, 0xFF,
- sizeof cma_mask->dst_addr.ip6);
+ if (!cma_any_addr(addr)) {
+ cma_data->dst_addr.ip6 = ip6_addr;
+ memset(&cma_mask->dst_addr.ip6, 0xFF,
+ sizeof(cma_mask->dst_addr.ip6));
+ }
}
break;
-#endif
default:
break;
}
@@ -1299,7 +1462,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
struct sockaddr_in *sin;
int ret = 0;
- if (cma_disable_callback(id_priv, CMA_CONNECT))
+ if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
return 0;
memset(&event, 0, sizeof event);
@@ -1315,6 +1478,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
switch ((int)iw_event->status) {
case 0:
event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
break;
case -ECONNRESET:
case -ECONNREFUSED:
@@ -1330,6 +1495,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
break;
case IW_CM_EVENT_ESTABLISHED:
event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
break;
default:
BUG_ON(1);
@@ -1342,7 +1509,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.iw = NULL;
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
@@ -1364,22 +1531,22 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct ib_device_attr attr;
listen_id = cm_id->context;
- if (cma_disable_callback(listen_id, CMA_LISTEN))
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
return -ECONNABORTED;
/* Create a new RDMA id for the new IW CM ID */
new_cm_id = rdma_create_id(listen_id->id.event_handler,
listen_id->id.context,
- RDMA_PS_TCP);
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(new_cm_id)) {
ret = -ENOMEM;
goto out;
}
conn_id = container_of(new_cm_id, struct rdma_id_private, id);
mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
- conn_id->state = CMA_CONNECT;
+ conn_id->state = RDMA_CM_CONNECT;
- dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr);
+ dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
if (!dev) {
ret = -EADDRNOTAVAIL;
mutex_unlock(&conn_id->handler_mutex);
@@ -1393,9 +1560,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
goto out;
}
- mutex_lock(&lock);
ret = cma_acquire_dev(conn_id);
- mutex_unlock(&lock);
if (ret) {
mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
@@ -1422,19 +1587,27 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
event.param.conn.private_data = iw_event->private_data;
event.param.conn.private_data_len = iw_event->private_data_len;
- event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
- event.param.conn.responder_resources = attr.max_qp_rd_atom;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
ret = conn_id->id.event_handler(&conn_id->id, &event);
if (ret) {
/* User wants to destroy the CM ID */
conn_id->cm_id.iw = NULL;
- cma_exch(conn_id, CMA_DESTROYING);
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
rdma_destroy_id(&conn_id->id);
goto out;
}
mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
out:
if (dev)
@@ -1447,17 +1620,19 @@ static int cma_ib_listen(struct rdma_id_private *id_priv)
{
struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
+ struct ib_cm_id *id;
__be64 svc_id;
int ret;
- id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
- id_priv);
- if (IS_ERR(id_priv->cm_id.ib))
- return PTR_ERR(id_priv->cm_id.ib);
+ id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ id_priv->cm_id.ib = id;
addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
svc_id = cma_get_service_id(id_priv->id.ps, addr);
- if (cma_any_addr(addr))
+ if (cma_any_addr(addr) && !id_priv->afonly)
ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
else {
cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
@@ -1476,13 +1651,16 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
{
int ret;
struct sockaddr_in *sin;
+ struct iw_cm_id *id;
- id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device,
+ id = iw_create_cm_id(id_priv->id.device,
id_priv->sock,
iw_conn_req_handler,
id_priv);
- if (IS_ERR(id_priv->cm_id.iw))
- return PTR_ERR(id_priv->cm_id.iw);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ id_priv->cm_id.iw = id;
sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
id_priv->cm_id.iw->local_addr = *sin;
@@ -1514,13 +1692,14 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
struct rdma_cm_id *id;
int ret;
- id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+ id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+ id_priv->id.qp_type);
if (IS_ERR(id))
return;
dev_id_priv = container_of(id, struct rdma_id_private, id);
- dev_id_priv->state = CMA_ADDR_BOUND;
+ dev_id_priv->state = RDMA_CM_ADDR_BOUND;
memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
@@ -1528,11 +1707,11 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
atomic_inc(&id_priv->refcount);
dev_id_priv->internal_id = 1;
+ dev_id_priv->afonly = id_priv->afonly;
ret = rdma_listen(id, id_priv->backlog);
if (ret)
- printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
- "listening on device %s\n", ret, cma_dev->device->name);
+ cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name);
}
static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -1546,58 +1725,23 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv)
mutex_unlock(&lock);
}
-int rdma_listen(struct rdma_cm_id *id, int backlog)
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
{
struct rdma_id_private *id_priv;
- int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (id_priv->state == CMA_IDLE) {
- ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
- ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
- if (ret)
- return ret;
- }
-
- if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
- return -EINVAL;
-
- id_priv->backlog = backlog;
- if (id->device) {
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- ret = cma_ib_listen(id_priv);
- if (ret)
- goto err;
- break;
- case RDMA_TRANSPORT_IWARP:
- ret = cma_iw_listen(id_priv, backlog);
- if (ret)
- goto err;
- break;
- default:
- ret = -ENOSYS;
- goto err;
- }
- } else
- cma_listen_on_all(id_priv);
-
- return 0;
-err:
- id_priv->backlog = 0;
- cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
- return ret;
+ id_priv->tos = (u8) tos;
}
-EXPORT_SYMBOL(rdma_listen);
+EXPORT_SYMBOL(rdma_set_service_type);
-void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+void rdma_set_timeout(struct rdma_cm_id *id, int timeout)
{
struct rdma_id_private *id_priv;
id_priv = container_of(id, struct rdma_id_private, id);
- id_priv->tos = (u8) tos;
+ id_priv->qp_timeout = (u8) timeout;
}
-EXPORT_SYMBOL(rdma_set_service_type);
+EXPORT_SYMBOL(rdma_set_timeout);
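
The new rdma_set_timeout() export records a per-ID QP timeout that cma_modify_qp_rts() applies to RC QPs through IB_QP_TIMEOUT (see that hunk earlier in this file). A hedged fragment of a consumer overriding the timeout before connecting; demo_connect is an illustrative name, and the value is assumed to use the usual IB local ACK timeout encoding of 4.096 us * 2^timeout:

#include <rdma/rdma_cm.h>

/* Illustrative fragment: raise the RC retransmission timeout for this
 * connection before issuing the connect request.  The value is only
 * picked up for IB_QPT_RC QPs, per the check in cma_modify_qp_rts(). */
static int demo_connect(struct rdma_cm_id *id,
                        struct rdma_conn_param *param)
{
        rdma_set_timeout(id, 18);       /* assumed 4.096 us << 18, roughly 1 s */
        return rdma_connect(id, param);
}
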
static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
void *context)
@@ -1611,8 +1755,8 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
route->num_paths = 1;
*route->path_rec = *path_rec;
} else {
- work->old_state = CMA_ROUTE_QUERY;
- work->new_state = CMA_ADDR_RESOLVED;
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
work->event.status = status;
}
@@ -1650,11 +1794,6 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
}
- if (tavor_quirk) {
- path_rec.mtu_selector = IB_SA_LT;
- path_rec.mtu = IB_MTU_2048;
- }
-
id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
id_priv->id.port_num, &path_rec,
comp_mask, timeout_ms,
@@ -1675,7 +1814,7 @@ static void cma_work_handler(struct work_struct *_work)
goto out;
if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
destroy = 1;
}
out:
@@ -1693,12 +1832,12 @@ static void cma_ndev_work_handler(struct work_struct *_work)
int destroy = 0;
mutex_lock(&id_priv->handler_mutex);
- if (id_priv->state == CMA_DESTROYING ||
- id_priv->state == CMA_DEVICE_REMOVAL)
+ if (id_priv->state == RDMA_CM_DESTROYING ||
+ id_priv->state == RDMA_CM_DEVICE_REMOVAL)
goto out;
if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
destroy = 1;
}
@@ -1722,8 +1861,8 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
work->id = id_priv;
INIT_WORK(&work->work, cma_work_handler);
- work->old_state = CMA_ROUTE_QUERY;
- work->new_state = CMA_ROUTE_RESOLVED;
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
@@ -1752,19 +1891,21 @@ int rdma_set_ib_paths(struct rdma_cm_id *id,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_RESOLVED))
return -EINVAL;
- id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+ id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+ GFP_KERNEL);
if (!id->route.path_rec) {
ret = -ENOMEM;
goto err;
}
- memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+ id->route.num_paths = num_paths;
return 0;
err:
- cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
return ret;
}
EXPORT_SYMBOL(rdma_set_ib_paths);
@@ -1779,8 +1920,8 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
work->id = id_priv;
INIT_WORK(&work->work, cma_work_handler);
- work->old_state = CMA_ROUTE_QUERY;
- work->new_state = CMA_ROUTE_RESOLVED;
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
queue_work(cma_wq, &work->work);
return 0;
@@ -1800,7 +1941,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
struct net_device *ndev = NULL;
- u16 vid;
+
if (src_addr->sin_family != dst_addr->sin_family)
return -EINVAL;
@@ -1827,10 +1968,15 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
goto err2;
}
- vid = rdma_vlan_dev_vlan_id(ndev);
+ route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
+ memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
+ memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen);
+
- iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid);
- iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &route->path_rec->sgid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+ &route->path_rec->dgid);
route->path_rec->hop_limit = 1;
route->path_rec->reversible = 1;
@@ -1838,23 +1984,19 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
route->path_rec->mtu_selector = IB_SA_EQ;
route->path_rec->sl = tos_to_sl(id_priv->tos);
-#ifdef __linux__
- route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
-#else
route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu);
-#endif
route->path_rec->rate_selector = IB_SA_EQ;
route->path_rec->rate = iboe_get_rate(ndev);
dev_put(ndev);
route->path_rec->packet_life_time_selector = IB_SA_EQ;
- route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME;
+ route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
if (!route->path_rec->mtu) {
ret = -EINVAL;
goto err2;
}
- work->old_state = CMA_ROUTE_QUERY;
- work->new_state = CMA_ROUTE_RESOLVED;
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
work->event.status = 0;
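
For RoCE ports the path record is now filled from the IP addresses themselves: rdma_ip2gid() derives the SGID and DGID from the bound source and destination addresses, while the L2 details move into the new smac/dmac/vlan_id path_rec fields. For illustration only, here is a standalone sketch of the conventional IPv4-to-GID mapping (the IPv4-mapped IPv6 form ::ffff:a.b.c.d) that such a helper is assumed to follow; demo_gid and demo_ip4_to_gid are local stand-ins, not the kernel API:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

/* Local stand-in for union ib_gid: 16 raw bytes. */
union demo_gid {
        unsigned char raw[16];
};

/* Map an IPv4 address into a GID as an IPv4-mapped IPv6 address. */
static void demo_ip4_to_gid(struct in_addr addr, union demo_gid *gid)
{
        memset(gid->raw, 0, sizeof(gid->raw));
        gid->raw[10] = 0xff;
        gid->raw[11] = 0xff;
        memcpy(&gid->raw[12], &addr, 4);        /* already network byte order */
}

int main(void)
{
        struct in_addr a;
        union demo_gid gid;
        char buf[INET6_ADDRSTRLEN];

        inet_pton(AF_INET, "192.0.2.1", &a);
        demo_ip4_to_gid(a, &gid);
        inet_ntop(AF_INET6, gid.raw, buf, sizeof(buf));
        printf("gid: %s\n", buf);       /* prints ::ffff:192.0.2.1 */
        return 0;
}
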
@@ -1876,7 +2018,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
return -EINVAL;
atomic_inc(&id_priv->refcount);
@@ -1894,6 +2036,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
}
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
ret = cma_resolve_iw_route(id_priv, timeout_ms);
break;
default:
@@ -1905,12 +2048,19 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
return 0;
err:
- cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
cma_deref_id(id_priv);
return ret;
}
EXPORT_SYMBOL(rdma_resolve_route);
+int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type)
+{
+ /* APM is not supported yet */
+ return -EINVAL;
+}
+EXPORT_SYMBOL(rdma_enable_apm);
+
static int cma_bind_loopback(struct rdma_id_private *id_priv)
{
struct cma_device *cma_dev;
@@ -1964,34 +2114,26 @@ static void addr_handler(int status, struct sockaddr *src_addr,
memset(&event, 0, sizeof event);
mutex_lock(&id_priv->handler_mutex);
-
- /*
- * Grab mutex to block rdma_destroy_id() from removing the device while
- * we're trying to acquire it.
- */
- mutex_lock(&lock);
- if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
- mutex_unlock(&lock);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED))
goto out;
- }
+ memcpy(&id_priv->id.route.addr.src_addr, src_addr,
+ ip_addr_size(src_addr));
if (!status && !id_priv->cma_dev)
status = cma_acquire_dev(id_priv);
- mutex_unlock(&lock);
if (status) {
- if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ADDR_BOUND))
goto out;
event.event = RDMA_CM_EVENT_ADDR_ERROR;
event.status = status;
- } else {
- memcpy(&id_priv->id.route.addr.src_addr, src_addr,
- ip_addr_size(src_addr));
+ } else
event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
- }
if (id_priv->id.event_handler(&id_priv->id, &event)) {
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
mutex_unlock(&id_priv->handler_mutex);
cma_deref_id(id_priv);
rdma_destroy_id(&id_priv->id);
@@ -2026,18 +2168,18 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
if (cma_zero_addr(src)) {
dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
if ((src->sa_family = dst->sa_family) == AF_INET) {
- ((struct sockaddr_in *) src)->sin_addr.s_addr =
- ((struct sockaddr_in *) dst)->sin_addr.s_addr;
+ ((struct sockaddr_in *)src)->sin_addr =
+ ((struct sockaddr_in *)dst)->sin_addr;
} else {
- ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr,
- &((struct sockaddr_in6 *) dst)->sin6_addr);
+ ((struct sockaddr_in6 *)src)->sin6_addr =
+ ((struct sockaddr_in6 *)dst)->sin6_addr;
}
}
work->id = id_priv;
INIT_WORK(&work->work, cma_work_handler);
- work->old_state = CMA_ADDR_QUERY;
- work->new_state = CMA_ADDR_RESOLVED;
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
queue_work(cma_wq, &work->work);
return 0;
@@ -2046,6 +2188,25 @@ err:
return ret;
}
+static int cma_resolve_scif(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ /* No address lookup is needed for SCIF; just queue the work that completes the ADDR_QUERY -> ADDR_RESOLVED transition. */
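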
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+}
+
static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
struct sockaddr *dst_addr)
{
@@ -2061,11 +2222,12 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
else {
struct sockaddr_in addr_in;
- memset(&addr_in, 0, sizeof addr_in);
- addr_in.sin_family = dst_addr->sa_family;
- addr_in.sin_len = sizeof addr_in;
- return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
+ memset(&addr_in, 0, sizeof addr_in);
+ addr_in.sin_family = dst_addr->sa_family;
+ addr_in.sin_len = sizeof addr_in;
+ return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
}
+
}
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
@@ -2075,19 +2237,22 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (id_priv->state == CMA_IDLE) {
+ if (id_priv->state == RDMA_CM_IDLE) {
ret = cma_bind_addr(id, src_addr, dst_addr);
if (ret)
return ret;
}
- if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
return -EINVAL;
atomic_inc(&id_priv->refcount);
memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
if (cma_any_addr(dst_addr))
ret = cma_resolve_loopback(id_priv);
+ else if (id_priv->id.device &&
+ rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF)
+ ret = cma_resolve_scif(id_priv);
else
ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
dst_addr, &id->route.addr.dev_addr,
@@ -2097,12 +2262,51 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
return 0;
err:
- cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
cma_deref_id(id_priv);
return ret;
}
EXPORT_SYMBOL(rdma_resolve_addr);
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ id_priv->reuseaddr = reuse;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) {
+ id_priv->options |= (1 << CMA_OPTION_AFONLY);
+ id_priv->afonly = afonly;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
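
Both new knobs are accepted only in a narrow state window: rdma_set_reuseaddr() while the ID is still idle, and rdma_set_afonly() while it is idle or address-bound, so a consumer has to order the calls around rdma_bind_addr(). A hedged sketch of a listener using both; demo_listen is an illustrative wrapper, not part of this patch:

#include <rdma/rdma_cm.h>

/* Illustrative fragment: opt into address reuse before binding, then
 * restrict the listener to the bound address family before listening. */
static int demo_listen(struct rdma_cm_id *id, struct sockaddr *addr,
                       int backlog)
{
        int ret;

        ret = rdma_set_reuseaddr(id, 1);        /* only valid while idle */
        if (ret)
                return ret;
        ret = rdma_bind_addr(id, addr);
        if (ret)
                return ret;
        ret = rdma_set_afonly(id, 1);           /* idle or address-bound */
        if (ret)
                return ret;
        return rdma_listen(id, backlog);
}
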
static void cma_bind_port(struct rdma_bind_list *bind_list,
struct rdma_id_private *id_priv)
{
@@ -2149,126 +2353,100 @@ err1:
static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
{
-#if defined(INET)
- struct rdma_bind_list *bind_list;
- int port, ret, low, high;
-
- bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
- if (!bind_list)
- return -ENOMEM;
-
-retry:
- /* FIXME: add proper port randomization per like inet_csk_get_port */
- do {
- ret = idr_get_new_above(ps, bind_list, next_port, &port);
- } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
-
- if (ret)
- goto err1;
+ static unsigned int last_used_port;
+ int low, high, remaining;
+ unsigned int rover;
inet_get_local_port_range(&low, &high);
- if (port > high) {
- if (next_port != low) {
- idr_remove(ps, port);
- next_port = low;
- goto retry;
+ remaining = (high - low) + 1;
+ rover = random() % remaining + low;
+retry:
+ if (last_used_port != rover &&
+ !idr_find(ps, (unsigned short) rover)) {
+ int ret = cma_alloc_port(ps, id_priv, rover);
+ /*
+ * Remember the previously used port number in order to avoid
+ * re-using the same port immediately after it is closed.
+ */
+ if (!ret)
+ last_used_port = rover;
+ if (ret != -EADDRNOTAVAIL)
+ return ret;
}
- ret = -EADDRNOTAVAIL;
- goto err2;
+ if (--remaining) {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ goto retry;
}
-
- if (port == high)
- next_port = low;
- else
- next_port = port + 1;
-
- bind_list->ps = ps;
- bind_list->port = (unsigned short) port;
- cma_bind_port(bind_list, id_priv);
- return 0;
-err2:
- idr_remove(ps, port);
-err1:
- kfree(bind_list);
- return ret;
-#else
- return -ENOSPC;
-#endif
+ return -EADDRNOTAVAIL;
}
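
cma_alloc_any_port() drops the old next_port/idr_get_new_above() scheme in favor of a rover search: pick a random starting port in the local port range, walk forward with wrap-around, skip the most recently used port, and give up after one full pass. A standalone model of that search follows, with a plain occupancy array standing in for the idr and demo_* names that are not part of the patch:

#include <stdio.h>
#include <stdlib.h>

/* Rover-style ephemeral port search: start at a random point in
 * [low, high], walk forward with wrap-around, skip the most recently
 * used port, and stop after trying every candidate once. */
static int demo_pick_port(const unsigned char *used, int low, int high,
                          int last_used)
{
        int remaining = high - low + 1;
        int rover = rand() % remaining + low;

        while (remaining--) {
                if (rover != last_used && !used[rover])
                        return rover;
                if (++rover > high)
                        rover = low;
        }
        return -1;      /* maps to -EADDRNOTAVAIL in the kernel code */
}

int main(void)
{
        unsigned char used[65536] = { 0 };

        used[50000] = 1;        /* pretend one port is already taken */
        printf("picked port %d\n", demo_pick_port(used, 49152, 65535, 0));
        return 0;
}
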
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+/*
+ * Check that the requested port is available. This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port. In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv, uint8_t reuseaddr)
{
struct rdma_id_private *cur_id;
- struct sockaddr_in *sin, *cur_sin;
- struct rdma_bind_list *bind_list;
+ struct sockaddr *addr, *cur_addr;
struct hlist_node *node;
- unsigned short snum;
- sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
- snum = ntohs(sin->sin_port);
-#ifdef __linux__
- if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
- return -EACCES;
-#endif
+ addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+ hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
+ if (id_priv == cur_id)
+ continue;
- bind_list = idr_find(ps, snum);
- if (!bind_list)
- return cma_alloc_port(ps, id_priv, snum);
+ if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr &&
+ cur_id->reuseaddr)
+ continue;
- /*
- * We don't support binding to any address if anyone is bound to
- * a specific address on the same port.
- */
- if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr))
- return -EADDRNOTAVAIL;
+ cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
+ if (id_priv->afonly && cur_id->afonly &&
+ (addr->sa_family != cur_addr->sa_family))
+ continue;
- hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
- if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr))
+ if (cma_any_addr(addr) || cma_any_addr(cur_addr))
return -EADDRNOTAVAIL;
- cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr;
- if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr)
+ if (!cma_addr_cmp(addr, cur_addr))
return -EADDRINUSE;
}
-
- cma_bind_port(bind_list, id_priv);
return 0;
}
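
In plain terms, cma_check_port() allows two bound-but-not-listening IDs to share a port when both asked for reuseaddr, never flags AF-only binders of different address families against each other, and otherwise reports EADDRNOTAVAIL when either side bound the wildcard address and EADDRINUSE when the specific addresses match. A small standalone model of those rules, reduced to a single new/current pair with string addresses; the demo_* names are illustrative:

#include <stdio.h>
#include <string.h>

/* Local stand-in for the per-ID fields the kernel consults.  An empty
 * address string models a wildcard bind. */
struct demo_binding {
        const char *addr;       /* "" = wildcard */
        int family;             /* AF_INET / AF_INET6 stand-ins */
        int listening;
        int reuseaddr;
        int afonly;
};

static int demo_check_port(const struct demo_binding *newb,
                           const struct demo_binding *cur, int reuseaddr)
{
        /* Two non-listening binders that both asked for reuse may share. */
        if (!cur->listening && reuseaddr && cur->reuseaddr)
                return 0;
        /* AF-only binders of different families never collide. */
        if (newb->afonly && cur->afonly && newb->family != cur->family)
                return 0;
        /* A wildcard bind conflicts with any other owner on the port. */
        if (!newb->addr[0] || !cur->addr[0])
                return -1;      /* EADDRNOTAVAIL */
        /* Identical specific addresses conflict outright. */
        if (!strcmp(newb->addr, cur->addr))
                return -2;      /* EADDRINUSE */
        return 0;
}

int main(void)
{
        struct demo_binding cur = { "10.0.0.1", 4, 1, 0, 0 };
        struct demo_binding new = { "10.0.0.2", 4, 0, 0, 0 };

        /* Distinct specific addresses: sharing the port is allowed. */
        printf("%d\n", demo_check_port(&new, &cur, new.reuseaddr));
        return 0;
}
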
-static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
{
+ struct rdma_bind_list *bind_list;
+ unsigned short snum;
int ret;
- int size;
- struct socket *sock;
- ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret)
- return ret;
-#ifdef __linux__
- ret = sock->ops->bind(sock,
- (struct sockaddr *) &id_priv->id.route.addr.src_addr,
- ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
-#else
- ret = -sobind(sock,
- (struct sockaddr *)&id_priv->id.route.addr.src_addr,
- curthread);
-#endif
- if (ret) {
- sock_release(sock);
- return ret;
- }
+ snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr));
- size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
- ret = sock_getname(sock,
- (struct sockaddr *) &id_priv->id.route.addr.src_addr,
- &size, 0);
- if (ret) {
- sock_release(sock);
- return ret;
+ bind_list = idr_find(ps, snum);
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, snum);
+ } else {
+ ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
}
+ return ret;
+}
- id_priv->sock = sock;
- return 0;
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ int ret = 0;
+
+ mutex_lock(&lock);
+ if (bind_list->owners.first->next)
+ ret = cma_check_port(bind_list, id_priv, 0);
+ mutex_unlock(&lock);
+ return ret;
}
static int cma_get_port(struct rdma_id_private *id_priv)
@@ -2282,11 +2460,6 @@ static int cma_get_port(struct rdma_id_private *id_priv)
break;
case RDMA_PS_TCP:
ps = &tcp_ps;
- if (unify_tcp_port_space) {
- ret = cma_get_tcp_port(id_priv);
- if (ret)
- goto out;
- }
break;
case RDMA_PS_UDP:
ps = &udp_ps;
@@ -2294,6 +2467,9 @@ static int cma_get_port(struct rdma_id_private *id_priv)
case RDMA_PS_IPOIB:
ps = &ipoib_ps;
break;
+ case RDMA_PS_IB:
+ ps = &ib_ps;
+ break;
default:
return -EPROTONOSUPPORT;
}
@@ -2304,7 +2480,7 @@ static int cma_get_port(struct rdma_id_private *id_priv)
else
ret = cma_use_port(ps, id_priv);
mutex_unlock(&lock);
-out:
+
return ret;
}
@@ -2318,11 +2494,7 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
return 0;
sin6 = (struct sockaddr_in6 *) addr;
-#ifdef __linux__
- if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
-#else
if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
-#endif
!sin6->sin6_scope_id)
return -EINVAL;
@@ -2331,48 +2503,105 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
return 0;
}
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
+ ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
+ return -EINVAL;
+
+ if (id_priv->reuseaddr) {
+ ret = cma_bind_listen(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ id_priv->backlog = backlog;
+ if (id->device) {
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+ break;
+ default:
+ ret = -ENOSYS;
+ goto err;
+ }
+ } else
+ cma_listen_on_all(id_priv);
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv;
int ret;
+ int ipv6only;
+ size_t var_size = sizeof(int);
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
return -EAFNOSUPPORT;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+ if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
return -EINVAL;
ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
if (ret)
goto err1;
+ memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
if (!cma_any_addr(addr)) {
- ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
+ ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL);
if (ret)
goto err1;
- mutex_lock(&lock);
ret = cma_acquire_dev(id_priv);
- mutex_unlock(&lock);
if (ret)
goto err1;
}
- memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+ if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+ if (addr->sa_family == AF_INET)
+ id_priv->afonly = 1;
+#if defined(INET6)
+ else if (addr->sa_family == AF_INET6)
+ id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only",
+ &ipv6only, &var_size, NULL, 0, NULL, 0);
+#endif
+ }
ret = cma_get_port(id_priv);
if (ret)
goto err2;
return 0;
err2:
- if (id_priv->cma_dev) {
- mutex_lock(&lock);
- cma_detach_from_dev(id_priv);
- mutex_unlock(&lock);
- }
+ if (id_priv->cma_dev)
+ cma_release_dev(id_priv);
err1:
- cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
return ret;
}
EXPORT_SYMBOL(rdma_bind_addr);
@@ -2445,7 +2674,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
- if (cma_disable_callback(id_priv, CMA_CONNECT))
+ if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
return 0;
memset(&event, 0, sizeof event);
@@ -2491,7 +2720,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
@@ -2506,10 +2735,14 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
{
struct ib_cm_sidr_req_param req;
struct rdma_route *route;
+ struct ib_cm_id *id;
int ret;
req.private_data_len = sizeof(struct cma_hdr) +
conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
if (!req.private_data)
return -ENOMEM;
@@ -2523,12 +2756,13 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
if (ret)
goto out;
- id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device,
- cma_sidr_rep_handler, id_priv);
- if (IS_ERR(id_priv->cm_id.ib)) {
- ret = PTR_ERR(id_priv->cm_id.ib);
+ id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+ id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
goto out;
}
+ id_priv->cm_id.ib = id;
req.path = route->path_rec;
req.service_id = cma_get_service_id(id_priv->id.ps,
@@ -2536,6 +2770,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
req.timeout_ms = 1 << (cma_response_timeout - 8);
req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ cma_dbg(id_priv, "sending SIDR REQ\n");
ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
if (ret) {
ib_destroy_cm_id(id_priv->cm_id.ib);
@@ -2552,11 +2787,15 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
struct ib_cm_req_param req;
struct rdma_route *route;
void *private_data;
+ struct ib_cm_id *id;
int offset, ret;
memset(&req, 0, sizeof req);
offset = cma_user_data_offset(id_priv->id.ps);
req.private_data_len = offset + conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
if (!private_data)
return -ENOMEM;
@@ -2565,12 +2804,12 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
memcpy(private_data + offset, conn_param->private_data,
conn_param->private_data_len);
- id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
- id_priv);
- if (IS_ERR(id_priv->cm_id.ib)) {
- ret = PTR_ERR(id_priv->cm_id.ib);
+ id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
goto out;
}
+ id_priv->cm_id.ib = id;
route = &id_priv->id.route;
ret = cma_format_hdr(private_data, id_priv->id.ps, route);
@@ -2585,22 +2824,23 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
req.service_id = cma_get_service_id(id_priv->id.ps,
(struct sockaddr *) &route->addr.dst_addr);
req.qp_num = id_priv->qp_num;
- req.qp_type = IB_QPT_RC;
+ req.qp_type = id_priv->id.qp_type;
req.starting_psn = id_priv->seq_num;
req.responder_resources = conn_param->responder_resources;
req.initiator_depth = conn_param->initiator_depth;
req.flow_control = conn_param->flow_control;
- req.retry_count = conn_param->retry_count;
- req.rnr_retry_count = conn_param->rnr_retry_count;
+ req.retry_count = min_t(u8, 7, conn_param->retry_count);
+ req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
req.remote_cm_response_timeout = cma_response_timeout;
req.local_cm_response_timeout = cma_response_timeout;
req.max_cm_retries = CMA_MAX_CM_RETRIES;
req.srq = id_priv->srq ? 1 : 0;
+ cma_dbg(id_priv, "sending REQ\n");
ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
out:
- if (ret && !IS_ERR(id_priv->cm_id.ib)) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
+ if (ret && !IS_ERR(id)) {
+ ib_destroy_cm_id(id);
id_priv->cm_id.ib = NULL;
}
@@ -2617,11 +2857,9 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
struct iw_cm_conn_param iw_param;
cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock,
- cma_iw_handler, id_priv);
- if (IS_ERR(cm_id)) {
- ret = PTR_ERR(cm_id);
- goto out;
- }
+ cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
id_priv->cm_id.iw = cm_id;
@@ -2635,17 +2873,19 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
if (ret)
goto out;
+ if (conn_param) {
iw_param.ord = conn_param->initiator_depth;
iw_param.ird = conn_param->responder_resources;
iw_param.private_data = conn_param->private_data;
iw_param.private_data_len = conn_param->private_data_len;
- if (id_priv->id.qp)
+ iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+ } else {
+ memset(&iw_param, 0, sizeof iw_param);
iw_param.qpn = id_priv->qp_num;
- else
- iw_param.qpn = conn_param->qp_num;
+ }
ret = iw_cm_connect(cm_id, &iw_param);
out:
- if (ret && !IS_ERR(cm_id)) {
+ if (ret) {
iw_destroy_cm_id(cm_id);
id_priv->cm_id.iw = NULL;
}
@@ -2658,7 +2898,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+ if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
if (!id->qp) {
@@ -2668,12 +2908,13 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (cma_is_ud_ps(id->ps))
+ if (id->qp_type == IB_QPT_UD)
ret = cma_resolve_ib_udp(id_priv, conn_param);
else
ret = cma_connect_ib(id_priv, conn_param);
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
ret = cma_connect_iw(id_priv, conn_param);
break;
default:
@@ -2685,7 +2926,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
return 0;
err:
- cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+ cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
return ret;
}
EXPORT_SYMBOL(rdma_connect);
@@ -2713,9 +2954,9 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
rep.initiator_depth = conn_param->initiator_depth;
rep.failover_accepted = 0;
rep.flow_control = conn_param->flow_control;
- rep.rnr_retry_count = conn_param->rnr_retry_count;
+ rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
rep.srq = id_priv->srq ? 1 : 0;
-
+ cma_dbg(id_priv, "sending REP\n");
ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
out:
return ret;
@@ -2727,6 +2968,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
struct iw_cm_conn_param iw_param;
int ret;
+ if (!conn_param)
+ return -EINVAL;
+
ret = cma_modify_qp_rtr(id_priv, conn_param);
if (ret)
return ret;
@@ -2762,6 +3006,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
rep.private_data = private_data;
rep.private_data_len = private_data_len;
+ cma_dbg(id_priv, "sending SIDR REP\n");
return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
}
@@ -2771,7 +3016,9 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp(id_priv, CMA_CONNECT))
+
+ id_priv->owner = curthread->td_proc->p_pid;
+ if (!cma_comp(id_priv, RDMA_CM_CONNECT))
return -EINVAL;
if (!id->qp && conn_param) {
@@ -2781,16 +3028,23 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (cma_is_ud_ps(id->ps))
+ if (id->qp_type == IB_QPT_UD) {
+ if (conn_param)
ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
conn_param->private_data,
conn_param->private_data_len);
- else if (conn_param)
+ else
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ NULL, 0);
+ } else {
+ if (conn_param)
ret = cma_accept_ib(id_priv, conn_param);
else
ret = cma_rep_recv(id_priv);
+ }
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
ret = cma_accept_iw(id_priv, conn_param);
break;
default:
@@ -2815,7 +3069,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_has_cm_dev(id_priv))
+ if (!id_priv->cm_id.ib)
return -EINVAL;
switch (id->device->node_type) {
@@ -2837,20 +3091,23 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_has_cm_dev(id_priv))
+ if (!id_priv->cm_id.ib)
return -EINVAL;
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (cma_is_ud_ps(id->ps))
+ if (id->qp_type == IB_QPT_UD)
ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
private_data, private_data_len);
- else
+ else {
+ cma_dbg(id_priv, "sending REJ\n");
ret = ib_send_cm_rej(id_priv->cm_id.ib,
IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, private_data, private_data_len);
+ }
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
ret = iw_cm_reject(id_priv->cm_id.iw,
private_data, private_data_len);
break;
@@ -2868,7 +3125,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_has_cm_dev(id_priv))
+ if (!id_priv->cm_id.ib)
return -EINVAL;
switch (rdma_node_get_transport(id->device->node_type)) {
@@ -2877,10 +3134,14 @@ int rdma_disconnect(struct rdma_cm_id *id)
if (ret)
goto out;
/* Initiate or respond to a disconnect. */
- if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+ cma_dbg(id_priv, "sending DREQ\n");
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) {
+ cma_dbg(id_priv, "sending DREP\n");
ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ }
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
break;
default:
@@ -2897,35 +3158,55 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
struct rdma_id_private *id_priv;
struct cma_multicast *mc = multicast->context;
struct rdma_cm_event event;
+ struct rdma_dev_addr *dev_addr;
int ret;
+ struct net_device *ndev = NULL;
+ u16 vlan;
id_priv = mc->id_priv;
- if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
- cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+ if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
+ cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
return 0;
mutex_lock(&id_priv->qp_mutex);
if (!status && id_priv->id.qp)
status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
- multicast->rec.mlid);
+ be16_to_cpu(multicast->rec.mlid));
mutex_unlock(&id_priv->qp_mutex);
memset(&event, 0, sizeof event);
event.status = status;
event.param.ud.private_data = mc->context;
+ ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!ndev) {
+ status = -ENODEV;
+ } else {
+ vlan = rdma_vlan_dev_vlan_id(ndev);
+ dev_put(ndev);
+ }
if (!status) {
event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
ib_init_ah_from_mcmember(id_priv->id.device,
id_priv->id.port_num, &multicast->rec,
&event.param.ud.ah_attr);
+ event.param.ud.ah_attr.vlan_id = vlan;
event.param.ud.qp_num = 0xFFFFFF;
event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
- } else
+ } else {
event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+ /* mark that the cached record is no longer valid */
+ if (status != -ENETRESET && status != -EAGAIN) {
+ spin_lock(&id_priv->lock);
+ id_priv->is_valid_rec = 0;
+ spin_unlock(&id_priv->lock);
+ }
+ }
+
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
- cma_exch(id_priv, CMA_DESTROYING);
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return 0;
@@ -2938,20 +3219,13 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
static void cma_set_mgid(struct rdma_id_private *id_priv,
struct sockaddr *addr, union ib_gid *mgid)
{
-#if defined(INET) || defined(INET6)
unsigned char mc_map[MAX_ADDR_LEN];
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
-#endif
-#ifdef INET
struct sockaddr_in *sin = (struct sockaddr_in *) addr;
-#endif
-#ifdef INET6
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
-#endif
if (cma_any_addr(addr)) {
memset(mgid, 0, sizeof *mgid);
-#ifdef INET6
} else if ((addr->sa_family == AF_INET6) &&
((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
0xFF10A01B)) {
@@ -2962,14 +3236,11 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
if (id_priv->id.ps == RDMA_PS_UDP)
mc_map[7] = 0x01; /* Use RDMA CM signature */
*mgid = *(union ib_gid *) (mc_map + 4);
-#endif
-#ifdef INET
} else {
ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
if (id_priv->id.ps == RDMA_PS_UDP)
mc_map[7] = 0x01; /* Use RDMA CM signature */
*mgid = *(union ib_gid *) (mc_map + 4);
-#endif
}
}
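The else branch above is the IPv4 path: ip_ib_mc_map() derives an IPoIB-style hardware address from the group and broadcast addresses, byte 7 carries the RDMA CM signature for RDMA_PS_UDP ids, and the MGID is taken from offset 4. An illustrative restatement of that mapping as a standalone helper (the helper itself is hypothetical):

static void example_ipv4_mgid(struct rdma_dev_addr *dev_addr, __be32 group,
                              int is_udp_ps, union ib_gid *mgid)
{
        unsigned char mc_map[MAX_ADDR_LEN];

        ip_ib_mc_map(group, dev_addr->broadcast, mc_map);
        if (is_udp_ps)
                mc_map[7] = 0x01;       /* RDMA CM signature */
        *mgid = *(union ib_gid *)(mc_map + 4);
}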
@@ -2979,13 +3250,26 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
struct ib_sa_mcmember_rec rec;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
ib_sa_comp_mask comp_mask;
- int ret;
+ int ret = 0;
- ib_addr_get_mgid(dev_addr, &rec.mgid);
- ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
- &rec.mgid, &rec);
- if (ret)
+ ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid);
+
+ /* cache the IPoIB broadcast group record */
+ spin_lock(&id_priv->lock);
+ if (!id_priv->is_valid_rec)
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num,
+ &id_priv->rec.mgid,
+ &id_priv->rec);
+ if (ret) {
+ id_priv->is_valid_rec = 0;
+ spin_unlock(&id_priv->lock);
return ret;
+ } else {
+ rec = id_priv->rec;
+ id_priv->is_valid_rec = 1;
+ }
+ spin_unlock(&id_priv->lock);
cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
if (id_priv->id.ps == RDMA_PS_UDP)
@@ -3002,19 +3286,18 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
if (id_priv->id.ps == RDMA_PS_IPOIB)
comp_mask |= IB_SA_MCMEMBER_REC_RATE |
- IB_SA_MCMEMBER_REC_RATE_SELECTOR;
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
id_priv->id.port_num, &rec,
comp_mask, GFP_KERNEL,
cma_ib_mc_handler, mc);
- if (IS_ERR(mc->multicast.ib))
- return PTR_ERR(mc->multicast.ib);
-
- return 0;
+ return PTR_RET(mc->multicast.ib);
}
-
static void iboe_mcast_work_handler(struct work_struct *work)
{
struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
@@ -3034,9 +3317,9 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
if (cma_any_addr(addr)) {
memset(mgid, 0, sizeof *mgid);
- } else if (addr->sa_family == AF_INET6)
+ } else if (addr->sa_family == AF_INET6) {
memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
- else {
+ } else {
mgid->raw[0] = 0xff;
mgid->raw[1] = 0x0e;
mgid->raw[2] = 0;
@@ -3087,20 +3370,16 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
err = -ENODEV;
goto out2;
}
-
mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
mc->multicast.ib->rec.hop_limit = 1;
-#ifdef __linux__
- mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
-#else
mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu);
-#endif
dev_put(ndev);
if (!mc->multicast.ib->rec.mtu) {
err = -EINVAL;
goto out2;
}
- iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &mc->multicast.ib->rec.port_gid);
work->id = id_priv;
work->mc = mc;
INIT_WORK(&work->work, iboe_mcast_work_handler);
@@ -3124,8 +3403,8 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
- !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+ if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+ !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
return -EINVAL;
mc = kmalloc(sizeof *mc, GFP_KERNEL);
@@ -3165,7 +3444,6 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
spin_unlock_irq(&id_priv->lock);
kfree(mc);
}
-
return ret;
}
EXPORT_SYMBOL(rdma_join_multicast);
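A join completes asynchronously: the SA (or the iboe work handler) reports back through the id's event handler as RDMA_CM_EVENT_MULTICAST_JOIN or RDMA_CM_EVENT_MULTICAST_ERROR, as handled in cma_ib_mc_handler() above. An illustrative caller sketch; the group address setup is assumed to be done elsewhere and the function names are hypothetical.

static int example_join(struct rdma_cm_id *id, struct sockaddr_in *grp)
{
        /* the id must be bound or address-resolved first */
        return rdma_join_multicast(id, (struct sockaddr *)grp, NULL);
}

static void example_leave(struct rdma_cm_id *id, struct sockaddr_in *grp)
{
        rdma_leave_multicast(id, (struct sockaddr *)grp);
}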
@@ -3185,7 +3463,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
if (id->qp)
ib_detach_mcast(id->qp,
&mc->multicast.ib->rec.mgid,
- mc->multicast.ib->rec.mlid);
+ be16_to_cpu(mc->multicast.ib->rec.mlid));
if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
switch (rdma_port_get_link_layer(id->device, id->port_num)) {
case IB_LINK_LAYER_INFINIBAND:
@@ -3213,17 +3491,10 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id
dev_addr = &id_priv->id.route.addr.dev_addr;
-#ifdef __linux__
- if ((dev_addr->bound_dev_if == ndev->ifindex) &&
- memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
- printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
- ndev->name, &id_priv->id);
-#else
if ((dev_addr->bound_dev_if == ndev->if_index) &&
memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) {
printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
ndev->if_xname, &id_priv->id);
-#endif
work = kzalloc(sizeof *work, GFP_KERNEL);
if (!work)
return -ENOMEM;
@@ -3246,7 +3517,8 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
struct rdma_id_private *id_priv;
int ret = NOTIFY_DONE;
-#ifdef __linux__
+/* Bonding-related checks, disabled until bonding support is resolved */
+#if 0
if (dev_net(ndev) != &init_net)
return NOTIFY_DONE;
@@ -3255,10 +3527,9 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
return NOTIFY_DONE;
-#else
+#endif
if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
-#endif
mutex_lock(&lock);
list_for_each_entry(cma_dev, &dev_list, list)
@@ -3303,19 +3574,19 @@ static void cma_add_one(struct ib_device *device)
static int cma_remove_id_dev(struct rdma_id_private *id_priv)
{
struct rdma_cm_event event;
- enum cma_state state;
+ enum rdma_cm_state state;
int ret = 0;
/* Record that we want to remove the device */
- state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
- if (state == CMA_DESTROYING)
+ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+ if (state == RDMA_CM_DESTROYING)
return 0;
cma_cancel_operation(id_priv, state);
mutex_lock(&id_priv->handler_mutex);
/* Check for destruction from another callback. */
- if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
+ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
goto out;
memset(&event, 0, sizeof event);
@@ -3370,22 +3641,18 @@ static void cma_remove_one(struct ib_device *device)
kfree(cma_dev);
}
-static int cma_init(void)
+static int __init cma_init(void)
{
- int ret;
-#if defined(INET)
- int low, high, remaining;
-
- get_random_bytes(&next_port, sizeof next_port);
- inet_get_local_port_range(&low, &high);
- remaining = (high - low) + 1;
- next_port = ((unsigned int) next_port % remaining) + low;
-#endif
+ int ret = -ENOMEM;
cma_wq = create_singlethread_workqueue("rdma_cm");
if (!cma_wq)
return -ENOMEM;
+ cma_free_wq = create_singlethread_workqueue("rdma_cm_fr");
+ if (!cma_free_wq)
+ goto err1;
+
ib_sa_register_client(&sa_client);
rdma_addr_register_client(&addr_client);
register_netdevice_notifier(&cma_nb);
@@ -3393,27 +3660,34 @@ static int cma_init(void)
ret = ib_register_client(&cma_client);
if (ret)
goto err;
+
return 0;
err:
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+
+ destroy_workqueue(cma_free_wq);
+err1:
destroy_workqueue(cma_wq);
return ret;
}
-static void cma_cleanup(void)
+static void __exit cma_cleanup(void)
{
ib_unregister_client(&cma_client);
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+ flush_workqueue(cma_free_wq);
+ destroy_workqueue(cma_free_wq);
destroy_workqueue(cma_wq);
idr_destroy(&sdp_ps);
idr_destroy(&tcp_ps);
idr_destroy(&udp_ps);
idr_destroy(&ipoib_ps);
+ idr_destroy(&ib_ps);
}
module_init(cma_init);
diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h
index 08c4bbb..001bbbe 100644
--- a/sys/ofed/drivers/infiniband/core/core_priv.h
+++ b/sys/ofed/drivers/infiniband/core/core_priv.h
@@ -38,7 +38,8 @@
#include <rdma/ib_verbs.h>
-int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *,
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
u8, struct kobject *));
void ib_device_unregister_sysfs(struct ib_device *device);
diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c
index 98adf48..a7a06d78 100644
--- a/sys/ofed/drivers/infiniband/core/device.c
+++ b/sys/ofed/drivers/infiniband/core/device.c
@@ -37,7 +37,6 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include "core_priv.h"
@@ -45,18 +44,15 @@ MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");
-#ifdef __ia64__
-/* workaround for a bug in hp chipset that would cause kernel
- panic when dma resources are exhaused */
-int dma_map_sg_hp_wa = 0;
-#endif
-
struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
};
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
@@ -99,7 +95,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
int i;
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
- if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) {
+ if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
device->name, mandatory_table[i].name);
return -EINVAL;
@@ -177,9 +173,14 @@ static int end_port(struct ib_device *device)
*/
struct ib_device *ib_alloc_device(size_t size)
{
+ struct ib_device *dev;
+
BUG_ON(size < sizeof (struct ib_device));
- return kzalloc(size, GFP_KERNEL);
+ dev = kzalloc(size, GFP_KERNEL);
+ spin_lock_init(&dev->cmd_perf_lock);
+
+ return dev;
}
EXPORT_SYMBOL(ib_alloc_device);
@@ -295,8 +296,6 @@ int ib_register_device(struct ib_device *device,
INIT_LIST_HEAD(&device->client_data_list);
spin_lock_init(&device->event_handler_lock);
spin_lock_init(&device->client_data_lock);
- device->ib_uverbs_xrcd_table = RB_ROOT;
- mutex_init(&device->xrcd_table_mutex);
ret = read_port_table_lengths(device);
if (ret) {
@@ -631,6 +630,9 @@ int ib_modify_device(struct ib_device *device,
int device_modify_mask,
struct ib_device_modify *device_modify)
{
+ if (!device->modify_device)
+ return -ENOSYS;
+
return device->modify_device(device, device_modify_mask,
device_modify);
}
@@ -651,6 +653,9 @@ int ib_modify_port(struct ib_device *device,
u8 port_num, int port_modify_mask,
struct ib_port_modify *port_modify)
{
+ if (!device->modify_port)
+ return -ENOSYS;
+
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
@@ -705,18 +710,28 @@ int ib_find_pkey(struct ib_device *device,
{
int ret, i;
u16 tmp_pkey;
+ int partial_ix = -1;
for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
-
if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
- *index = i;
- return 0;
+ /* if there is a full-member pkey, take it */
+ if (tmp_pkey & 0x8000) {
+ *index = i;
+ return 0;
+ }
+ if (partial_ix < 0)
+ partial_ix = i;
}
}
+ /* no full member; take a limited member if one exists */
+ if (partial_ix >= 0) {
+ *index = partial_ix;
+ return 0;
+ }
return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);
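With the change above, ib_find_pkey() prefers a full-member entry (bit 15 set) and falls back to a limited-member match only when no full member exists. An illustrative caller looking up the default P_Key (the wrapper function is hypothetical):

static int example_pkey_lookup(struct ib_device *device, u8 port_num)
{
        u16 index;
        int ret;

        ret = ib_find_pkey(device, port_num, 0xffff, &index);
        if (ret)
                return ret;     /* -ENOENT: neither full nor limited member found */
        return index;
}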
@@ -725,21 +740,29 @@ static int __init ib_core_init(void)
{
int ret;
-#ifdef __ia64__
- if (ia64_platform_is("hpzx1"))
- dma_map_sg_hp_wa = 1;
-#endif
+ ib_wq = create_workqueue("infiniband");
+ if (!ib_wq)
+ return -ENOMEM;
ret = ib_sysfs_setup();
- if (ret)
+ if (ret) {
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+ goto err;
+ }
ret = ib_cache_setup();
if (ret) {
printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
- ib_sysfs_cleanup();
+ goto err_sysfs;
}
+ return 0;
+
+err_sysfs:
+ ib_sysfs_cleanup();
+
+err:
+ destroy_workqueue(ib_wq);
return ret;
}
@@ -748,7 +771,7 @@ static void __exit ib_core_cleanup(void)
ib_cache_cleanup();
ib_sysfs_cleanup();
/* Make sure that any pending umem accounting work is done. */
- flush_scheduled_work();
+ destroy_workqueue(ib_wq);
}
module_init(ib_core_init);
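The ib_wq workqueue introduced in this file replaces the earlier reliance on global scheduled work (flush_scheduled_work()); consumers queue deferred work on it, and the core drains it by destroying the queue on unload. A minimal, illustrative use, assuming ib_wq is declared for consumers in the verbs header; the work item here is hypothetical.

static void example_task(struct work_struct *work)
{
        /* deferred InfiniBand-related processing */
}

static DECLARE_WORK(example_work, example_task);

static void example_defer(void)
{
        queue_work(ib_wq, &example_work);
}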
diff --git a/sys/ofed/drivers/infiniband/core/fmr_pool.c b/sys/ofed/drivers/infiniband/core/fmr_pool.c
index c225833..bda7abc 100644
--- a/sys/ofed/drivers/infiniband/core/fmr_pool.c
+++ b/sys/ofed/drivers/infiniband/core/fmr_pool.c
@@ -33,6 +33,7 @@
#include <linux/errno.h>
#include <linux/spinlock.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/kthread.h>
@@ -150,7 +151,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
#ifdef DEBUG
if (fmr->ref_count !=0) {
- printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
+ printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n",
fmr, fmr->ref_count);
}
#endif
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c
index 27878a8..14d23cc 100644
--- a/sys/ofed/drivers/infiniband/core/iwcm.c
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c
@@ -40,9 +40,12 @@
#include <linux/idr.h>
#include <linux/interrupt.h>
#include <linux/rbtree.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
#include <linux/string.h>
#include <rdma/iw_cm.h>
@@ -507,6 +510,8 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
if (!qp) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
return -EINVAL;
}
cm_id->device->iwcm->add_ref(qp);
@@ -566,6 +571,8 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
if (!qp) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
return -EINVAL;
}
cm_id->device->iwcm->add_ref(qp);
@@ -620,17 +627,6 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
*/
BUG_ON(iw_event->status);
- /*
- * We could be destroying the listening id. If so, ignore this
- * upcall.
- */
- spin_lock_irqsave(&listen_id_priv->lock, flags);
- if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
- spin_unlock_irqrestore(&listen_id_priv->lock, flags);
- goto out;
- }
- spin_unlock_irqrestore(&listen_id_priv->lock, flags);
-
cm_id = iw_create_cm_id(listen_id_priv->id.device,
iw_event->so,
listen_id_priv->id.cm_handler,
@@ -646,6 +642,19 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
ret = alloc_work_entries(cm_id_priv, 3);
if (ret) {
iw_cm_reject(cm_id, NULL, 0);
@@ -723,7 +732,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
*/
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
- if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
+ if (iw_event->status == 0) {
cm_id_priv->id.local_addr = iw_event->local_addr;
cm_id_priv->id.remote_addr = iw_event->remote_addr;
cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
diff --git a/sys/ofed/drivers/infiniband/core/local_sa.c b/sys/ofed/drivers/infiniband/core/local_sa.c
deleted file mode 100644
index 9b9c60a..0000000
--- a/sys/ofed/drivers/infiniband/core/local_sa.c
+++ /dev/null
@@ -1,1273 +0,0 @@
-/*
- * Copyright (c) 2006 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/dma-mapping.h>
-#include <linux/err.h>
-#include <linux/interrupt.h>
-#include <linux/rbtree.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/miscdevice.h>
-#include <linux/random.h>
-
-#include <rdma/ib_cache.h>
-#include <rdma/ib_sa.h>
-#include "sa.h"
-
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("InfiniBand subnet administration caching");
-MODULE_LICENSE("Dual BSD/GPL");
-
-enum {
- SA_DB_MAX_PATHS_PER_DEST = 0x7F,
- SA_DB_MIN_RETRY_TIMER = 4000, /* 4 sec */
- SA_DB_MAX_RETRY_TIMER = 256000 /* 256 sec */
-};
-
-static int set_paths_per_dest(const char *val, struct kernel_param *kp);
-static unsigned long paths_per_dest = 0;
-module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
- &paths_per_dest, 0644);
-MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
- "to each destination (DGID). Set to 0 "
- "to disable cache.");
-
-static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
-static char subscribe_inform_info = 1;
-module_param_call(subscribe_inform_info, set_subscribe_inform_info,
- param_get_bool, &subscribe_inform_info, 0644);
-MODULE_PARM_DESC(subscribe_inform_info,
- "Subscribe for SA InformInfo/Notice events.");
-
-static int do_refresh(const char *val, struct kernel_param *kp);
-module_param_call(refresh, do_refresh, NULL, NULL, 0200);
-
-static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
-
-enum sa_db_lookup_method {
- SA_DB_LOOKUP_LEAST_USED,
- SA_DB_LOOKUP_RANDOM
-};
-
-static int set_lookup_method(const char *val, struct kernel_param *kp);
-static int get_lookup_method(char *buf, struct kernel_param *kp);
-static unsigned long lookup_method;
-module_param_call(lookup_method, set_lookup_method, get_lookup_method,
- &lookup_method, 0644);
-MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
- "multiple paths exist to a given destination.");
-
-static void sa_db_add_dev(struct ib_device *device);
-static void sa_db_remove_dev(struct ib_device *device);
-
-static struct ib_client sa_db_client = {
- .name = "local_sa",
- .add = sa_db_add_dev,
- .remove = sa_db_remove_dev
-};
-
-static LIST_HEAD(dev_list);
-static DEFINE_MUTEX(lock);
-static rwlock_t rwlock;
-static struct workqueue_struct *sa_wq;
-static struct ib_sa_client sa_client;
-
-enum sa_db_state {
- SA_DB_IDLE,
- SA_DB_REFRESH,
- SA_DB_DESTROY
-};
-
-struct sa_db_port {
- struct sa_db_device *dev;
- struct ib_mad_agent *agent;
- /* Limit number of outstanding MADs to SA to reduce SA flooding */
- struct ib_mad_send_buf *msg;
- u16 sm_lid;
- u8 sm_sl;
- struct ib_inform_info *in_info;
- struct ib_inform_info *out_info;
- struct rb_root paths;
- struct list_head update_list;
- unsigned long update_id;
- enum sa_db_state state;
- struct work_struct work;
- union ib_gid gid;
- int port_num;
-};
-
-struct sa_db_device {
- struct list_head list;
- struct ib_device *device;
- struct ib_event_handler event_handler;
- int start_port;
- int port_count;
- struct sa_db_port port[0];
-};
-
-struct ib_sa_iterator {
- struct ib_sa_iterator *next;
-};
-
-struct ib_sa_attr_iter {
- struct ib_sa_iterator *iter;
- unsigned long flags;
-};
-
-struct ib_sa_attr_list {
- struct ib_sa_iterator iter;
- struct ib_sa_iterator *tail;
- int update_id;
- union ib_gid gid;
- struct rb_node node;
-};
-
-struct ib_path_rec_info {
- struct ib_sa_iterator iter; /* keep first */
- struct ib_sa_path_rec rec;
- unsigned long lookups;
-};
-
-struct ib_sa_mad_iter {
- struct ib_mad_recv_wc *recv_wc;
- struct ib_mad_recv_buf *recv_buf;
- int attr_size;
- int attr_offset;
- int data_offset;
- int data_left;
- void *attr;
- u8 attr_data[0];
-};
-
-enum sa_update_type {
- SA_UPDATE_FULL,
- SA_UPDATE_ADD,
- SA_UPDATE_REMOVE
-};
-
-struct update_info {
- struct list_head list;
- union ib_gid gid;
- enum sa_update_type type;
-};
-
-struct sa_path_request {
- struct work_struct work;
- struct ib_sa_client *client;
- void (*callback)(int, struct ib_sa_path_rec *, void *);
- void *context;
- struct ib_sa_path_rec path_rec;
-};
-
-static void process_updates(struct sa_db_port *port);
-
-static void free_attr_list(struct ib_sa_attr_list *attr_list)
-{
- struct ib_sa_iterator *cur;
-
- for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) {
- attr_list->iter.next = cur->next;
- kfree(cur);
- }
- attr_list->tail = &attr_list->iter;
-}
-
-static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
-{
- rb_erase(&attr_list->node, root);
- free_attr_list(attr_list);
- kfree(attr_list);
-}
-
-static void remove_all_attrs(struct rb_root *root)
-{
- struct rb_node *node, *next_node;
- struct ib_sa_attr_list *attr_list;
-
- write_lock_irq(&rwlock);
- for (node = rb_first(root); node; node = next_node) {
- next_node = rb_next(node);
- attr_list = rb_entry(node, struct ib_sa_attr_list, node);
- remove_attr(root, attr_list);
- }
- write_unlock_irq(&rwlock);
-}
-
-static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
-{
- struct rb_node *node, *next_node;
- struct ib_sa_attr_list *attr_list;
-
- write_lock_irq(&rwlock);
- for (node = rb_first(root); node; node = next_node) {
- next_node = rb_next(node);
- attr_list = rb_entry(node, struct ib_sa_attr_list, node);
- if (attr_list->update_id != update_id)
- remove_attr(root, attr_list);
- }
- write_unlock_irq(&rwlock);
-}
-
-static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
- struct ib_sa_attr_list *attr_list)
-{
- struct rb_node **link = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ib_sa_attr_list *cur_attr_list;
- int cmp;
-
- while (*link) {
- parent = *link;
- cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node);
- cmp = memcmp(&cur_attr_list->gid, &attr_list->gid,
- sizeof attr_list->gid);
- if (cmp < 0)
- link = &(*link)->rb_left;
- else if (cmp > 0)
- link = &(*link)->rb_right;
- else
- return cur_attr_list;
- }
- rb_link_node(&attr_list->node, parent, link);
- rb_insert_color(&attr_list->node, root);
- return NULL;
-}
-
-static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
-{
- struct rb_node *node = root->rb_node;
- struct ib_sa_attr_list *attr_list;
- int cmp;
-
- while (node) {
- attr_list = rb_entry(node, struct ib_sa_attr_list, node);
- cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid);
- if (cmp < 0)
- node = node->rb_left;
- else if (cmp > 0)
- node = node->rb_right;
- else
- return attr_list;
- }
- return NULL;
-}
-
-static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
- struct ib_sa_iterator *iter)
-{
- struct ib_sa_attr_list *attr_list;
- void *err;
-
- write_lock_irq(&rwlock);
- attr_list = find_attr_list(root, key);
- if (!attr_list) {
- write_unlock_irq(&rwlock);
- attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
- if (!attr_list)
- return -ENOMEM;
-
- attr_list->iter.next = NULL;
- attr_list->tail = &attr_list->iter;
- attr_list->update_id = update_id;
- memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);
-
- write_lock_irq(&rwlock);
- err = insert_attr_list(root, attr_list);
- if (err) {
- write_unlock_irq(&rwlock);
- kfree(attr_list);
- return PTR_ERR(err);
- }
- } else if (attr_list->update_id != update_id) {
- free_attr_list(attr_list);
- attr_list->update_id = update_id;
- }
-
- attr_list->tail->next = iter;
- iter->next = NULL;
- attr_list->tail = iter;
- write_unlock_irq(&rwlock);
- return 0;
-}
-
-static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
-{
- struct ib_sa_mad_iter *iter;
- struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
- int attr_size, attr_offset;
-
- attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
- attr_size = 64; /* path record length */
- if (attr_offset < attr_size)
- return ERR_PTR(-EINVAL);
-
- iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL);
- if (!iter)
- return ERR_PTR(-ENOMEM);
-
- iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
- iter->recv_wc = mad_recv_wc;
- iter->recv_buf = &mad_recv_wc->recv_buf;
- iter->attr_offset = attr_offset;
- iter->attr_size = attr_size;
- return iter;
-}
-
-static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
-{
- kfree(iter);
-}
-
-static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
-{
- struct ib_sa_mad *mad;
- int left, offset = 0;
-
- while (iter->data_left >= iter->attr_offset) {
- while (iter->data_offset < IB_MGMT_SA_DATA) {
- mad = (struct ib_sa_mad *) iter->recv_buf->mad;
-
- left = IB_MGMT_SA_DATA - iter->data_offset;
- if (left < iter->attr_size) {
- /* copy first piece of the attribute */
- iter->attr = &iter->attr_data;
- memcpy(iter->attr,
- &mad->data[iter->data_offset], left);
- offset = left;
- break;
- } else if (offset) {
- /* copy the second piece of the attribute */
- memcpy(iter->attr + offset, &mad->data[0],
- iter->attr_size - offset);
- iter->data_offset = iter->attr_size - offset;
- offset = 0;
- } else {
- iter->attr = &mad->data[iter->data_offset];
- iter->data_offset += iter->attr_size;
- }
-
- iter->data_left -= iter->attr_offset;
- goto out;
- }
- iter->data_offset = 0;
- iter->recv_buf = list_entry(iter->recv_buf->list.next,
- struct ib_mad_recv_buf, list);
- }
- iter->attr = NULL;
-out:
- return iter->attr;
-}
-
-/*
- * Copy path records from a received response and insert them into our cache.
- * A path record in the MADs are in network order, packed, and may
- * span multiple MAD buffers, just to make our life hard.
- */
-static void update_path_db(struct sa_db_port *port,
- struct ib_mad_recv_wc *mad_recv_wc,
- enum sa_update_type type)
-{
- struct ib_sa_mad_iter *iter;
- struct ib_path_rec_info *path_info;
- void *attr;
- int ret;
-
- iter = ib_sa_iter_create(mad_recv_wc);
- if (IS_ERR(iter))
- return;
-
- port->update_id += (type == SA_UPDATE_FULL);
-
- while ((attr = ib_sa_iter_next(iter)) &&
- (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
-
- ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
-
- ret = insert_attr(&port->paths, port->update_id,
- path_info->rec.dgid.raw, &path_info->iter);
- if (ret) {
- kfree(path_info);
- break;
- }
- }
- ib_sa_iter_free(iter);
-
- if (type == SA_UPDATE_FULL)
- remove_old_attrs(&port->paths, port->update_id);
-}
-
-static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
- struct update_info *update)
-{
- struct ib_ah_attr ah_attr;
- struct ib_mad_send_buf *msg;
-
- msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
- IB_MGMT_SA_DATA, GFP_KERNEL);
- if (IS_ERR(msg))
- return NULL;
-
- memset(&ah_attr, 0, sizeof ah_attr);
- ah_attr.dlid = port->sm_lid;
- ah_attr.sl = port->sm_sl;
- ah_attr.port_num = port->port_num;
-
- msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
- if (IS_ERR(msg->ah)) {
- ib_free_send_mad(msg);
- return NULL;
- }
-
- msg->timeout_ms = retry_timer;
- msg->retries = 0;
- msg->context[0] = port;
- msg->context[1] = update;
- return msg;
-}
-
-static __be64 form_tid(u32 hi_tid)
-{
- static atomic_t tid;
- return cpu_to_be64((((u64) hi_tid) << 32) |
- ((u32) atomic_inc_return(&tid)));
-}
-
-static void format_path_req(struct sa_db_port *port,
- struct update_info *update,
- struct ib_mad_send_buf *msg)
-{
- struct ib_sa_mad *mad = msg->mad;
- struct ib_sa_path_rec path_rec;
-
- mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
- mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
- mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
- mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE;
- mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
- mad->mad_hdr.tid = form_tid(msg->mad_agent->hi_tid);
-
- mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;
-
- path_rec.sgid = port->gid;
- path_rec.numb_path = (u8) paths_per_dest;
-
- if (update->type == SA_UPDATE_ADD) {
- mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
- memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
- }
-
- ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
-}
-
-static int send_query(struct sa_db_port *port,
- struct update_info *update)
-{
- int ret;
-
- port->msg = get_sa_msg(port, update);
- if (!port->msg)
- return -ENOMEM;
-
- format_path_req(port, update, port->msg);
-
- ret = ib_post_send_mad(port->msg, NULL);
- if (ret)
- goto err;
-
- return 0;
-
-err:
- ib_destroy_ah(port->msg->ah);
- ib_free_send_mad(port->msg);
- return ret;
-}
-
-static void add_update(struct sa_db_port *port, u8 *gid,
- enum sa_update_type type)
-{
- struct update_info *update;
-
- update = kmalloc(sizeof *update, GFP_KERNEL);
- if (update) {
- if (gid)
- memcpy(&update->gid, gid, sizeof update->gid);
- update->type = type;
- list_add(&update->list, &port->update_list);
- }
-
- if (port->state == SA_DB_IDLE) {
- port->state = SA_DB_REFRESH;
- process_updates(port);
- }
-}
-
-static void clean_update_list(struct sa_db_port *port)
-{
- struct update_info *update;
-
- while (!list_empty(&port->update_list)) {
- update = list_entry(port->update_list.next,
- struct update_info, list);
- list_del(&update->list);
- kfree(update);
- }
-}
-
-static int notice_handler(int status, struct ib_inform_info *info,
- struct ib_sa_notice *notice)
-{
- struct sa_db_port *port = info->context;
- struct ib_sa_notice_data_gid *gid_data;
- struct ib_inform_info **pinfo;
- enum sa_update_type type;
-
- if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
- pinfo = &port->in_info;
- type = SA_UPDATE_ADD;
- } else {
- pinfo = &port->out_info;
- type = SA_UPDATE_REMOVE;
- }
-
- mutex_lock(&lock);
- if (port->state == SA_DB_DESTROY || !*pinfo) {
- mutex_unlock(&lock);
- return 0;
- }
-
- if (notice) {
- gid_data = (struct ib_sa_notice_data_gid *)
- &notice->data_details;
- add_update(port, gid_data->gid, type);
- mutex_unlock(&lock);
- } else if (status == -ENETRESET) {
- *pinfo = NULL;
- mutex_unlock(&lock);
- } else {
- if (status)
- *pinfo = ERR_PTR(-EINVAL);
- port->state = SA_DB_IDLE;
- clean_update_list(port);
- mutex_unlock(&lock);
- queue_work(sa_wq, &port->work);
- }
-
- return status;
-}
-
-static int reg_in_info(struct sa_db_port *port)
-{
- int ret = 0;
-
- port->in_info = ib_sa_register_inform_info(&sa_client,
- port->dev->device,
- port->port_num,
- IB_SA_SM_TRAP_GID_IN_SERVICE,
- GFP_KERNEL, notice_handler,
- port);
- if (IS_ERR(port->in_info))
- ret = PTR_ERR(port->in_info);
-
- return ret;
-}
-
-static int reg_out_info(struct sa_db_port *port)
-{
- int ret = 0;
-
- port->out_info = ib_sa_register_inform_info(&sa_client,
- port->dev->device,
- port->port_num,
- IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
- GFP_KERNEL, notice_handler,
- port);
- if (IS_ERR(port->out_info))
- ret = PTR_ERR(port->out_info);
-
- return ret;
-}
-
-static void unsubscribe_port(struct sa_db_port *port)
-{
- if (port->in_info && !IS_ERR(port->in_info))
- ib_sa_unregister_inform_info(port->in_info);
-
- if (port->out_info && !IS_ERR(port->out_info))
- ib_sa_unregister_inform_info(port->out_info);
-
- port->out_info = NULL;
- port->in_info = NULL;
-
-}
-
-static void cleanup_port(struct sa_db_port *port)
-{
- unsubscribe_port(port);
-
- clean_update_list(port);
- remove_all_attrs(&port->paths);
-}
-
-static int update_port_info(struct sa_db_port *port)
-{
- struct ib_port_attr port_attr;
- int ret;
-
- ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
- if (ret)
- return ret;
-
- if (port_attr.state != IB_PORT_ACTIVE)
- return -ENODATA;
-
- port->sm_lid = port_attr.sm_lid;
- port->sm_sl = port_attr.sm_sl;
- return 0;
-}
-
-static void process_updates(struct sa_db_port *port)
-{
- struct update_info *update;
- struct ib_sa_attr_list *attr_list;
- int ret;
-
- if (!paths_per_dest || update_port_info(port)) {
- cleanup_port(port);
- goto out;
- }
-
- /* Event registration is an optimization, so ignore failures. */
- if (subscribe_inform_info) {
- if (!port->out_info) {
- ret = reg_out_info(port);
- if (!ret)
- return;
- }
-
- if (!port->in_info) {
- ret = reg_in_info(port);
- if (!ret)
- return;
- }
- } else
- unsubscribe_port(port);
-
- while (!list_empty(&port->update_list)) {
- update = list_entry(port->update_list.next,
- struct update_info, list);
-
- if (update->type == SA_UPDATE_REMOVE) {
- write_lock_irq(&rwlock);
- attr_list = find_attr_list(&port->paths,
- update->gid.raw);
- if (attr_list)
- remove_attr(&port->paths, attr_list);
- write_unlock_irq(&rwlock);
- } else {
- ret = send_query(port, update);
- if (!ret)
- return;
-
- }
- list_del(&update->list);
- kfree(update);
- }
-out:
- port->state = SA_DB_IDLE;
-}
-
-static void refresh_port_db(struct sa_db_port *port)
-{
- if (port->state == SA_DB_DESTROY)
- return;
-
- if (port->state == SA_DB_REFRESH) {
- clean_update_list(port);
- ib_cancel_mad(port->agent, port->msg);
- }
-
- add_update(port, NULL, SA_UPDATE_FULL);
-}
-
-static void refresh_dev_db(struct sa_db_device *dev)
-{
- int i;
-
- for (i = 0; i < dev->port_count; i++)
- refresh_port_db(&dev->port[i]);
-}
-
-static void refresh_db(void)
-{
- struct sa_db_device *dev;
-
- list_for_each_entry(dev, &dev_list, list)
- refresh_dev_db(dev);
-}
-
-static int do_refresh(const char *val, struct kernel_param *kp)
-{
- mutex_lock(&lock);
- refresh_db();
- mutex_unlock(&lock);
- return 0;
-}
-
-static int get_lookup_method(char *buf, struct kernel_param *kp)
-{
- return sprintf(buf,
- "%c %d round robin\n"
- "%c %d random",
- (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ',
- SA_DB_LOOKUP_LEAST_USED,
- (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ',
- SA_DB_LOOKUP_RANDOM);
-}
-
-static int set_lookup_method(const char *val, struct kernel_param *kp)
-{
- unsigned long method;
- int ret = 0;
-
- method = simple_strtoul(val, NULL, 0);
-
- switch (method) {
- case SA_DB_LOOKUP_LEAST_USED:
- case SA_DB_LOOKUP_RANDOM:
- lookup_method = method;
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static int set_paths_per_dest(const char *val, struct kernel_param *kp)
-{
- int ret;
-
- mutex_lock(&lock);
- ret = param_set_ulong(val, kp);
- if (ret)
- goto out;
-
- if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
- paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
- refresh_db();
-out:
- mutex_unlock(&lock);
- return ret;
-}
-
-static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
-{
- int ret;
-
- ret = param_set_bool(val, kp);
- if (ret)
- return ret;
-
- return do_refresh(val, kp);
-}
-
-static void port_work_handler(struct work_struct *work)
-{
- struct sa_db_port *port;
-
- port = container_of(work, typeof(*port), work);
- mutex_lock(&lock);
- refresh_port_db(port);
- mutex_unlock(&lock);
-}
-
-static void handle_event(struct ib_event_handler *event_handler,
- struct ib_event *event)
-{
- struct sa_db_device *dev;
- struct sa_db_port *port;
-
- dev = container_of(event_handler, typeof(*dev), event_handler);
- port = &dev->port[event->element.port_num - dev->start_port];
-
- switch (event->event) {
- case IB_EVENT_PORT_ERR:
- case IB_EVENT_LID_CHANGE:
- case IB_EVENT_SM_CHANGE:
- case IB_EVENT_CLIENT_REREGISTER:
- case IB_EVENT_PKEY_CHANGE:
- case IB_EVENT_PORT_ACTIVE:
- queue_work(sa_wq, &port->work);
- break;
- default:
- break;
- }
-}
-
-static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
-{
- read_unlock_irqrestore(&rwlock, iter->flags);
-}
-
-static int ib_create_path_iter(struct ib_device *device, u8 port_num,
- union ib_gid *dgid, struct ib_sa_attr_iter *iter)
-{
- struct sa_db_device *dev;
- struct sa_db_port *port;
- struct ib_sa_attr_list *list;
-
- dev = ib_get_client_data(device, &sa_db_client);
- if (!dev)
- return -ENODEV;
-
- port = &dev->port[port_num - dev->start_port];
-
- read_lock_irqsave(&rwlock, iter->flags);
- list = find_attr_list(&port->paths, dgid->raw);
- if (!list) {
- ib_free_path_iter(iter);
- return -ENODATA;
- }
-
- iter->iter = &list->iter;
- return 0;
-}
-
-static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
-{
- struct ib_path_rec_info *next_path;
-
- iter->iter = iter->iter->next;
- if (iter->iter) {
- next_path = container_of(iter->iter, struct ib_path_rec_info, iter);
- return &next_path->rec;
- } else
- return NULL;
-}
-
-static int cmp_rec(struct ib_sa_path_rec *src,
- struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
-{
- /* DGID check already done */
- if (comp_mask & IB_SA_PATH_REC_SGID &&
- memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
- src->raw_traffic != dst->raw_traffic)
- return -EINVAL;
-
- if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
- src->flow_label != dst->flow_label)
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
- src->hop_limit != dst->hop_limit)
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
- src->traffic_class != dst->traffic_class)
- return -EINVAL;
- if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
- dst->reversible && !src->reversible)
- return -EINVAL;
- /* Numb path check already done */
- if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
- return -EINVAL;
-
- if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
- return -EINVAL;
-
- if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
- IB_SA_PATH_REC_MTU, dst->mtu_selector,
- src->mtu, dst->mtu))
- return -EINVAL;
- if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
- IB_SA_PATH_REC_RATE, dst->rate_selector,
- src->rate, dst->rate))
- return -EINVAL;
- if (ib_sa_check_selector(comp_mask,
- IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
- IB_SA_PATH_REC_PACKET_LIFE_TIME,
- dst->packet_life_time_selector,
- src->packet_life_time, dst->packet_life_time))
- return -EINVAL;
-
- return 0;
-}
-
-static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
- struct ib_sa_path_rec *req_path,
- ib_sa_comp_mask comp_mask)
-{
- struct ib_sa_path_rec *path, *rand_path = NULL;
- int num, count = 0;
-
- for (path = ib_get_next_path(iter); path;
- path = ib_get_next_path(iter)) {
- if (!cmp_rec(path, req_path, comp_mask)) {
- get_random_bytes(&num, sizeof num);
- if ((num % ++count) == 0)
- rand_path = path;
- }
- }
-
- return rand_path;
-}
-
-static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
- struct ib_sa_path_rec *req_path,
- ib_sa_comp_mask comp_mask)
-{
- struct ib_path_rec_info *cur_path, *next_path = NULL;
- struct ib_sa_path_rec *path;
- unsigned long lookups = ~0;
-
- for (path = ib_get_next_path(iter); path;
- path = ib_get_next_path(iter)) {
- if (!cmp_rec(path, req_path, comp_mask)) {
-
- cur_path = container_of(iter->iter, struct ib_path_rec_info,
- iter);
- if (cur_path->lookups < lookups) {
- lookups = cur_path->lookups;
- next_path = cur_path;
- }
- }
- }
-
- if (next_path) {
- next_path->lookups++;
- return &next_path->rec;
- } else
- return NULL;
-}
-
-static void report_path(struct work_struct *work)
-{
- struct sa_path_request *req;
-
- req = container_of(work, struct sa_path_request, work);
- req->callback(0, &req->path_rec, req->context);
- ib_sa_client_put(req->client);
- kfree(req);
-}
-
-/**
- * ib_sa_path_rec_get - Start a Path get query
- * @client:SA client
- * @device:device to send query on
- * @port_num: port number to send query on
- * @rec:Path Record to send in query
- * @comp_mask:component mask to send in query
- * @timeout_ms:time to wait for response
- * @gfp_mask:GFP mask to use for internal allocations
- * @callback:function called when query completes, times out or is
- * canceled
- * @context:opaque user context passed to callback
- * @sa_query:query context, used to cancel query
- *
- * Send a Path Record Get query to the SA to look up a path. The
- * callback function will be called when the query completes (or
- * fails); status is 0 for a successful response, -EINTR if the query
- * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
- * occurred sending the query. The resp parameter of the callback is
- * only valid if status is 0.
- *
- * If the return value of ib_sa_path_rec_get() is negative, it is an
- * error code. Otherwise it is a query ID that can be used to cancel
- * the query.
- */
-int ib_sa_path_rec_get(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_path_rec *rec,
- ib_sa_comp_mask comp_mask,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_path_rec *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query)
-{
- struct sa_path_request *req;
- struct ib_sa_attr_iter iter;
- struct ib_sa_path_rec *path_rec;
- int ret;
-
- if (!paths_per_dest)
- goto query_sa;
-
- if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
- !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
- goto query_sa;
-
- req = kmalloc(sizeof *req, gfp_mask);
- if (!req)
- goto query_sa;
-
- ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
- if (ret)
- goto free_req;
-
- if (lookup_method == SA_DB_LOOKUP_RANDOM)
- path_rec = get_random_path(&iter, rec, comp_mask);
- else
- path_rec = get_next_path(&iter, rec, comp_mask);
-
- if (!path_rec)
- goto free_iter;
-
- memcpy(&req->path_rec, path_rec, sizeof *path_rec);
- ib_free_path_iter(&iter);
-
- INIT_WORK(&req->work, report_path);
- req->client = client;
- req->callback = callback;
- req->context = context;
-
- ib_sa_client_get(client);
- queue_work(sa_wq, &req->work);
- *sa_query = ERR_PTR(-EEXIST);
- return 0;
-
-free_iter:
- ib_free_path_iter(&iter);
-free_req:
- kfree(req);
-query_sa:
- return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
- timeout_ms, gfp_mask, callback, context,
- sa_query);
-}
-EXPORT_SYMBOL(ib_sa_path_rec_get);
-
-static void recv_handler(struct ib_mad_agent *mad_agent,
- struct ib_mad_recv_wc *mad_recv_wc)
-{
- struct sa_db_port *port;
- struct update_info *update;
- struct ib_mad_send_buf *msg;
- enum sa_update_type type;
-
- msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
- port = msg->context[0];
- update = msg->context[1];
-
- mutex_lock(&lock);
- if (port->state == SA_DB_DESTROY ||
- update != list_entry(port->update_list.next,
- struct update_info, list)) {
- mutex_unlock(&lock);
- } else {
- type = update->type;
- mutex_unlock(&lock);
- update_path_db(mad_agent->context, mad_recv_wc, type);
- }
-
- ib_free_recv_mad(mad_recv_wc);
-}
-
-static void send_handler(struct ib_mad_agent *agent,
- struct ib_mad_send_wc *mad_send_wc)
-{
- struct ib_mad_send_buf *msg;
- struct sa_db_port *port;
- struct update_info *update;
- int ret;
-
- msg = mad_send_wc->send_buf;
- port = msg->context[0];
- update = msg->context[1];
-
- mutex_lock(&lock);
- if (port->state == SA_DB_DESTROY)
- goto unlock;
-
- if (update == list_entry(port->update_list.next,
- struct update_info, list)) {
-
- if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
- msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
-
- msg->timeout_ms <<= 1;
- ret = ib_post_send_mad(msg, NULL);
- if (!ret) {
- mutex_unlock(&lock);
- return;
- }
- }
- list_del(&update->list);
- kfree(update);
- }
- process_updates(port);
-unlock:
- mutex_unlock(&lock);
-
- ib_destroy_ah(msg->ah);
- ib_free_send_mad(msg);
-}
-
-static int init_port(struct sa_db_device *dev, int port_num)
-{
- struct sa_db_port *port;
- int ret;
-
- port = &dev->port[port_num - dev->start_port];
- port->dev = dev;
- port->port_num = port_num;
- INIT_WORK(&port->work, port_work_handler);
- port->paths = RB_ROOT;
- INIT_LIST_HEAD(&port->update_list);
-
- ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
- if (ret)
- return ret;
-
- port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
- NULL, IB_MGMT_RMPP_VERSION,
- send_handler, recv_handler, port);
- if (IS_ERR(port->agent))
- ret = PTR_ERR(port->agent);
-
- return ret;
-}
-
-static void destroy_port(struct sa_db_port *port)
-{
- mutex_lock(&lock);
- port->state = SA_DB_DESTROY;
- mutex_unlock(&lock);
-
- ib_unregister_mad_agent(port->agent);
- cleanup_port(port);
- flush_workqueue(sa_wq);
-}
-
-static void sa_db_add_dev(struct ib_device *device)
-{
- struct sa_db_device *dev;
- struct sa_db_port *port;
- int s, e, i, ret;
-
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- s = e = 0;
- } else {
- s = 1;
- e = device->phys_port_cnt;
- }
-
- dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
- if (!dev)
- return;
-
- dev->start_port = s;
- dev->port_count = e - s + 1;
- dev->device = device;
- for (i = 0; i < dev->port_count; i++) {
- ret = init_port(dev, s + i);
- if (ret)
- goto err;
- }
-
- ib_set_client_data(device, &sa_db_client, dev);
-
- INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
-
- mutex_lock(&lock);
- list_add_tail(&dev->list, &dev_list);
- refresh_dev_db(dev);
- mutex_unlock(&lock);
-
- ib_register_event_handler(&dev->event_handler);
- return;
-err:
- while (i--)
- destroy_port(&dev->port[i]);
- kfree(dev);
-}
-
-static void sa_db_remove_dev(struct ib_device *device)
-{
- struct sa_db_device *dev;
- int i;
-
- dev = ib_get_client_data(device, &sa_db_client);
- if (!dev)
- return;
-
- ib_unregister_event_handler(&dev->event_handler);
- flush_workqueue(sa_wq);
-
- for (i = 0; i < dev->port_count; i++)
- destroy_port(&dev->port[i]);
-
- mutex_lock(&lock);
- list_del(&dev->list);
- mutex_unlock(&lock);
-
- kfree(dev);
-}
-
-int sa_db_init(void)
-{
- int ret;
-
- rwlock_init(&rwlock);
- sa_wq = create_singlethread_workqueue("local_sa");
- if (!sa_wq)
- return -ENOMEM;
-
- ib_sa_register_client(&sa_client);
- ret = ib_register_client(&sa_db_client);
- if (ret)
- goto err;
-
- return 0;
-
-err:
- ib_sa_unregister_client(&sa_client);
- destroy_workqueue(sa_wq);
- return ret;
-}
-
-void sa_db_cleanup(void)
-{
- ib_unregister_client(&sa_db_client);
- ib_sa_unregister_client(&sa_client);
- destroy_workqueue(sa_wq);
-}
diff --git a/sys/ofed/drivers/infiniband/core/mad.c b/sys/ofed/drivers/infiniband/core/mad.c
index 64e660c..11b3ba3 100644
--- a/sys/ofed/drivers/infiniband/core/mad.c
+++ b/sys/ofed/drivers/infiniband/core/mad.c
@@ -34,6 +34,9 @@
*
*/
#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
@@ -46,8 +49,8 @@ MODULE_DESCRIPTION("kernel IB MAD API");
MODULE_AUTHOR("Hal Rosenstock");
MODULE_AUTHOR("Sean Hefty");
-int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
-int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
module_param_named(send_queue_size, mad_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
@@ -59,9 +62,26 @@ static struct kmem_cache *ib_mad_cache;
static struct list_head ib_mad_port_list;
static u32 ib_mad_client_id = 0;
-/* Port list lock */
-static spinlock_t ib_mad_port_list_lock;
+/*
+ * Timeout FIFO (tf) parameters
+ */
+enum {
+ /* min time between 2 consecutive activations of tf workqueue */
+ MIN_BETWEEN_ACTIVATIONS_MS = 5
+};
+
+/*
+ * SA congestion control params
+ */
+enum {
+ MAX_OUTSTANDING_SA_MADS = 10,
+ MIN_TIME_FOR_SA_MAD_SEND_MS = 20,
+ MAX_SA_MADS = 10000
+};
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
/* Forward declarations */
static int method_in_use(struct ib_mad_mgmt_method_table **method,
@@ -80,6 +100,509 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
u8 mgmt_class);
static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
struct ib_mad_agent_private *agent_priv);
+static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr,
+ u32 timeout_ms, u32 retries_left);
+
+
+/*
+ * Timeout FIFO functions - implement a FIFO with a timeout mechanism
+ */
+
+static void activate_timeout_handler_task(unsigned long data)
+{
+ struct to_fifo *tf;
+
+ tf = (struct to_fifo *)data;
+ del_timer(&tf->timer);
+ queue_work(tf->workq, &tf->work);
+}
+
+static unsigned long adjusted_time(unsigned long last, unsigned long next)
+{
+ unsigned long min_next;
+
+ min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS);
+ if (time_after(min_next, next))
+ return min_next;
+
+ return next;
+}
+
+static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_wc_status status)
+{
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_mad_agent_private *mad_agent_priv;
+
+ mad_send_wc.status = status;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc);
+}
+
+static inline struct sa_cc_data *
+get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc;
+}
+
+static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe)
+{
+ return container_of(tfe, struct ib_mad_send_wr_private, tf_list);
+}
+
+static void timeout_handler_task(struct work_struct *work)
+{
+ struct tf_entry *tmp1, *tmp2;
+ struct list_head *list_item, exp_lst;
+ unsigned long flags, curr_time;
+ int lst_empty;
+ struct to_fifo *tf;
+
+ tf = container_of(work, struct to_fifo, work);
+ do {
+ INIT_LIST_HEAD(&exp_lst);
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ curr_time = jiffies;
+ list_for_each(list_item, &tf->to_head) {
+ tmp1 = list_entry(list_item, struct tf_entry, to_list);
+ if (time_before(curr_time, tmp1->exp_time))
+ break;
+ list_del(&tmp1->fifo_list);
+ tf->num_items--;
+ }
+
+ /* cut list up to and including list_item->prev */
+ list_cut_position(&exp_lst, &tf->to_head, list_item->prev);
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+
+ lst_empty = list_empty(&exp_lst);
+ list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) {
+ list_del(&tmp1->to_list);
+ if (tmp1->canceled) {
+ tmp1->canceled = 0;
+ notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR);
+ } else {
+ notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR);
+ }
+ }
+ } while (!lst_empty);
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ if (!list_empty(&tf->to_head)) {
+ tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list);
+ mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time));
+ }
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+}
+
+/**
+ * tf_create - creates new timeout-fifo object
+ * @fifo_size: Maximum fifo size
+ *
+ * Allocate and initialize new timeout-fifo object
+ */
+static struct to_fifo *tf_create(u32 fifo_size)
+{
+ struct to_fifo *tf;
+
+ tf = kzalloc(sizeof(*tf), GFP_KERNEL);
+ if (tf) {
+ tf->workq = create_singlethread_workqueue("to_fifo");
+ if (!tf->workq) {
+ kfree(tf);
+ return NULL;
+ }
+ spin_lock_init(&tf->lists_lock);
+ INIT_LIST_HEAD(&tf->to_head);
+ INIT_LIST_HEAD(&tf->fifo_head);
+ init_timer(&tf->timer);
+ INIT_WORK(&tf->work, timeout_handler_task);
+ tf->timer.data = (unsigned long) tf;
+ tf->timer.function = activate_timeout_handler_task;
+ tf->timer.expires = jiffies;
+ tf->fifo_size = fifo_size;
+ tf->stop_enqueue = 0;
+ tf->num_items = 0;
+ }
+
+ return tf;
+}
+
+/**
+ * tf_enqueue - enqueue item to timeout-fifo object
+ * @tf:timeout-fifo object
+ * @item: item to enqueue.
+ * @timeout_ms: item expiration time in ms.
+ *
+ * Enqueue item to fifo and modify expiration timer when required.
+ *
+ * Returns 0 on success and negative on failure.
+ */
+static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms)
+{
+ struct tf_entry *tmp;
+ struct list_head *list_item;
+ unsigned long flags;
+
+ item->exp_time = jiffies + msecs_to_jiffies(timeout_ms);
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) {
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+ return -EBUSY;
+ }
+
+ /* Insert item to timeout list */
+ list_for_each_prev(list_item, &tf->to_head) {
+ tmp = list_entry(list_item, struct tf_entry, to_list);
+ if (time_after(item->exp_time, tmp->exp_time))
+ break;
+ }
+
+ list_add(&item->to_list, list_item);
+
+ /* Insert item to fifo list */
+ list_add_tail(&item->fifo_list, &tf->fifo_head);
+
+ tf->num_items++;
+
+ /* modify expiration timer if required */
+ if (list_item == &tf->to_head)
+ mod_timer(&tf->timer, item->exp_time);
+
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+
+ return 0;
+}
+
+/**
+ * tf_dequeue - dequeue item from timeout-fifo object
+ * @tf:timeout-fifo object
+ * @time_left_ms: returns the time left for expiration in ms.
+ *
+ * Dequeue item from fifo and modify expiration timer when required.
+ *
+ * Returns pointer to tf_entry on success and NULL on failure.
+ */
+static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms)
+{
+ unsigned long flags;
+ unsigned long time_left;
+ struct tf_entry *tmp, *tmp1;
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ if (list_empty(&tf->fifo_head)) {
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+ return NULL;
+ }
+
+ list_for_each_entry(tmp, &tf->fifo_head, fifo_list) {
+ if (!tmp->canceled)
+ break;
+ }
+
+ if (tmp->canceled) {
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+ return NULL;
+ }
+
+ /* modify timer in case enqueued item is the next to expire */
+ if (tf->to_head.next == &tmp->to_list) {
+ if (list_is_last(&tmp->to_list, &tf->to_head)) {
+ del_timer(&tf->timer);
+ } else {
+ tmp1 = list_entry(tmp->to_list.next, struct tf_entry, to_list);
+ mod_timer(&tf->timer, tmp1->exp_time);
+ }
+ }
+ list_del(&tmp->fifo_list);
+ list_del(&tmp->to_list);
+ tf->num_items--;
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+
+ time_left = tmp->exp_time - jiffies;
+ if ((long) time_left <= 0)
+ time_left = 0;
+ *time_left_ms = jiffies_to_msecs(time_left);
+
+ return tmp;
+}
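
The time-left computation just above uses the standard jiffies idiom: the unsigned subtraction wraps around when the entry has already expired, and viewing the result as a signed value recovers that case. A tiny standalone illustration (the numbers are made up, and the behaviour assumes the usual two's-complement targets):

#include <stdio.h>

int
main(void)
{
	unsigned long now = 1000UL;			/* stand-in for jiffies */
	unsigned long exp_time = 900UL;			/* expired 100 ticks ago */
	unsigned long time_left = exp_time - now;	/* wraps to a huge value */

	if ((long)time_left <= 0)			/* signed view exposes the wrap */
		time_left = 0;

	printf("time left: %lu\n", time_left);		/* prints 0 */
	return (0);
}
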
+
+static void tf_stop_enqueue(struct to_fifo *tf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ tf->stop_enqueue = 1;
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+}
+
+/**
+ * tf_free - free an empty timeout-fifo object
+ * @tf: timeout-fifo object
+ *
+ */
+static void tf_free(struct to_fifo *tf)
+{
+ del_timer_sync(&tf->timer);
+ flush_workqueue(tf->workq);
+ destroy_workqueue(tf->workq);
+
+ kfree(tf);
+}
+
+/**
+ * tf_free_agent - free MADs belonging to a specific MAD agent from the timeout-fifo
+ * @tf: timeout-fifo object
+ * @mad_agent_priv: MAD agent.
+ *
+ */
+static void tf_free_agent(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct tf_entry *tmp, *tmp1;
+ struct list_head tmp_head;
+
+ INIT_LIST_HEAD(&tmp_head);
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) {
+ if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) {
+ list_del(&tmp->to_list);
+ list_move(&tmp->fifo_list, &tmp_head);
+ tf->num_items--;
+ }
+ }
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+
+ list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) {
+ list_del(&tmp->fifo_list);
+ notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR);
+ }
+}
+
+/**
+ * tf_modify_item - modify the expiration time of a specific item
+ * @tf: timeout-fifo object
+ * @mad_agent_priv: MAD agent.
+ * @send_buf: the MAD to modify in queue
+ * @timeout_ms: new timeout to set.
+ *
+ * Returns 0 if item found on list and -ENXIO if not.
+ *
+ * Note: send_buf may point to a MAD that has already been released.
+ * Therefore this struct must not be used before it is found in the list.
+ */
+static int tf_modify_item(struct to_fifo *tf,
+ struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ struct tf_entry *tmp, *item;
+ struct list_head *list_item;
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&tf->lists_lock, flags);
+ list_for_each_entry(item, &tf->fifo_head, fifo_list) {
+ if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv &&
+ &tfe_to_mad(item)->send_buf == send_buf) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+ return -ENXIO;
+ }
+
+ item->exp_time = jiffies + msecs_to_jiffies(timeout_ms);
+
+ if (timeout_ms) {
+ list_del(&item->to_list);
+ list_for_each_prev(list_item, &tf->to_head) {
+ tmp = list_entry(list_item, struct tf_entry, to_list);
+ if (time_after(item->exp_time, tmp->exp_time))
+ break;
+ }
+ list_add(&item->to_list, list_item);
+
+ /* modify expiration timer if required */
+ if (list_item == &tf->to_head)
+ mod_timer(&tf->timer, item->exp_time);
+ } else {
+ /*
+ * when item canceled (timeout_ms == 0) move item to
+ * head of timeout list and to the tail of fifo list
+ */
+ item->canceled = 1;
+ list_move(&item->to_list, &tf->to_head);
+ list_move_tail(&item->fifo_list, &tf->fifo_head);
+ mod_timer(&tf->timer, item->exp_time);
+ }
+ spin_unlock_irqrestore(&tf->lists_lock, flags);
+
+ return 0;
+}
+
+/*
+ * SA congestion control functions
+ */
+
+/*
+ * Determines whether a MAD is subject to congestion control.
+ */
+static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_hdr *mad;
+
+ mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad;
+
+ return ((mad_send_wr->send_buf.timeout_ms) &&
+ (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) &&
+ ((mad->method == IB_MGMT_METHOD_GET) ||
+ (mad->method == IB_MGMT_METHOD_SET)));
+}
+
+/*
+ * Notify that an SA congestion-controlled MAD is done,
+ * to allow dequeuing the next SA MAD from the congestion control queue.
+ */
+static void sa_cc_mad_done(struct sa_cc_data *cc_obj)
+{
+ unsigned long flags;
+ struct tf_entry *tfe;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ u32 time_left_ms, timeout_ms, retries;
+ int ret;
+
+ do {
+ spin_lock_irqsave(&cc_obj->lock, flags);
+ tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
+ if (!tfe) {
+ if (cc_obj->outstanding > 0)
+ cc_obj->outstanding--;
+ spin_unlock_irqrestore(&cc_obj->lock, flags);
+ break;
+ }
+ spin_unlock_irqrestore(&cc_obj->lock, flags);
+ mad_send_wr = tfe_to_mad(tfe);
+ time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS;
+ if (time_left_ms > mad_send_wr->send_buf.timeout_ms) {
+ retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1;
+ timeout_ms = mad_send_wr->send_buf.timeout_ms;
+ } else {
+ retries = 0;
+ timeout_ms = time_left_ms;
+ }
+ ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries);
+ if (ret) {
+ if (ret == -ENOMEM)
+ notify_failure(mad_send_wr, IB_WC_GENERAL_ERR);
+ else
+ notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR);
+ }
+ } while (ret);
+}
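
When a queued MAD is released here, the time left on its fifo entry (plus the minimum send time added back above) is turned into a fresh per-send timeout and retry count: keep the caller's original timeout if the budget allows more than one attempt, otherwise make a single shorter attempt. A small arithmetic sketch with made-up numbers:

#include <stdio.h>

int
main(void)
{
	unsigned int time_left_ms = 2500;	/* remaining budget for this MAD */
	unsigned int orig_timeout_ms = 1000;	/* caller's per-send timeout */
	unsigned int retries, timeout_ms;

	if (time_left_ms > orig_timeout_ms) {
		/* Full timeouts fit: spend the surplus as retries. */
		retries = time_left_ms / orig_timeout_ms - 1;	/* 2 - 1 = 1 */
		timeout_ms = orig_timeout_ms;
	} else {
		/* Budget is short: one attempt with whatever time is left. */
		retries = 0;
		timeout_ms = time_left_ms;
	}

	printf("timeout=%u ms, retries=%u\n", timeout_ms, retries);
	return (0);
}
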
+
+/*
+ * Send SA MAD under congestion control.
+ */
+static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ unsigned long flags;
+ int ret;
+ struct sa_cc_data *cc_obj;
+
+ cc_obj = get_cc_obj(mad_send_wr);
+ spin_lock_irqsave(&cc_obj->lock, flags);
+ if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) {
+ cc_obj->outstanding++;
+ spin_unlock_irqrestore(&cc_obj->lock, flags);
+ ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms,
+ mad_send_wr->retries_left);
+ if (ret)
+ sa_cc_mad_done(cc_obj);
+
+ } else {
+ int qtime = (mad_send_wr->send_buf.timeout_ms *
+ (mad_send_wr->retries_left + 1))
+ - MIN_TIME_FOR_SA_MAD_SEND_MS;
+
+ if (qtime < 0)
+ qtime = 0;
+ ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime);
+
+ spin_unlock_irqrestore(&cc_obj->lock, flags);
+ }
+
+ return ret;
+}
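
The queue time passed to tf_enqueue() above is the caller's whole send budget, timeout_ms times the number of attempts, minus the minimum time reserved for actually transmitting the MAD, clamped at zero so an entry is never queued as already expired. A short sketch of that clamp; the value used for MIN_TIME_FOR_SA_MAD_SEND_MS here is an assumption for illustration, not the driver's real constant:

#include <stdio.h>

#define MIN_TIME_FOR_SA_MAD_SEND_MS 20	/* assumed value, illustration only */

int
main(void)
{
	int timeout_ms = 5;		/* deliberately small to trigger the clamp */
	int retries_left = 1;
	int qtime = timeout_ms * (retries_left + 1) - MIN_TIME_FOR_SA_MAD_SEND_MS;

	if (qtime < 0)			/* never enqueue with negative time */
		qtime = 0;

	printf("qtime=%d ms\n", qtime);	/* 10 - 20 = -10, clamped to 0 */
	return (0);
}
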
+
+/*
+ * Initialize SA congestion control.
+ */
+static int sa_cc_init(struct sa_cc_data *cc_obj)
+{
+ spin_lock_init(&cc_obj->lock);
+ cc_obj->outstanding = 0;
+ cc_obj->tf = tf_create(MAX_SA_MADS);
+ if (!cc_obj->tf)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Cancel SA MADs from congestion control queue.
+ */
+static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf,
+ mad_agent_priv);
+}
+
+/*
+ * Modify timeout of SA MAD on congestion control queue.
+ */
+static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ int ret;
+ int qtime = 0;
+
+ if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS)
+ qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS;
+
+ ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf,
+ mad_agent_priv, send_buf, (u32)qtime);
+ return ret;
+}
+
+static void sa_cc_destroy(struct sa_cc_data *cc_obj)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct tf_entry *tfe;
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_mad_agent_private *mad_agent_priv;
+ u32 time_left_ms;
+
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ tf_stop_enqueue(cc_obj->tf);
+ tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
+ while (tfe) {
+ mad_send_wr = tfe_to_mad(tfe);
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
+ }
+ tf_free(cc_obj->tf);
+}
/*
* Returns a ib_mad_port_private structure or NULL for a device/port
@@ -184,15 +707,6 @@ int ib_response_mad(struct ib_mad *mad)
}
EXPORT_SYMBOL(ib_response_mad);
-static void timeout_callback(unsigned long data)
-{
- struct ib_mad_agent_private *mad_agent_priv =
- (struct ib_mad_agent_private *) data;
-
- queue_work(mad_agent_priv->qp_info->port_priv->wq,
- &mad_agent_priv->timeout_work);
-}
-
/*
* ib_register_mad_agent - Register to send/receive MADs
*/
@@ -285,6 +799,13 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error1;
}
+ /* Verify the QP requested is supported. For example, Ethernet devices
+ * will not have QP0 */
+ if (!port_priv->qp_info[qpn].qp) {
+ ret = ERR_PTR(-EPROTONOSUPPORT);
+ goto error1;
+ }
+
/* Allocate structures */
mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
if (!mad_agent_priv) {
@@ -300,13 +821,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
}
if (mad_reg_req) {
- reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+ reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
if (!reg_req) {
ret = ERR_PTR(-ENOMEM);
goto error3;
}
- /* Make a copy of the MAD registration request */
- memcpy(reg_req, mad_reg_req, sizeof *reg_req);
}
/* Now, fill in the various structures */
@@ -324,9 +843,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
INIT_LIST_HEAD(&mad_agent_priv->wait_list);
INIT_LIST_HEAD(&mad_agent_priv->done_list);
INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
- INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends);
- setup_timer(&mad_agent_priv->timeout_timer, timeout_callback,
- (unsigned long) mad_agent_priv);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
INIT_LIST_HEAD(&mad_agent_priv->local_list);
INIT_WORK(&mad_agent_priv->local_work, local_completions);
atomic_set(&mad_agent_priv->refcount, 1);
@@ -533,8 +1050,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
*/
cancel_mads(mad_agent_priv);
port_priv = mad_agent_priv->qp_info->port_priv;
- del_timer_sync(&mad_agent_priv->timeout_timer);
- cancel_work_sync(&mad_agent_priv->timeout_work);
+ cancel_delayed_work(&mad_agent_priv->timed_work);
spin_lock_irqsave(&port_priv->reg_lock, flags);
remove_mad_reg_req(mad_agent_priv);
@@ -577,6 +1093,7 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_snoop_private *mad_snoop_priv;
+ if (!IS_ERR(mad_agent)) {
/* If the TID is zero, the agent can only snoop. */
if (mad_agent->hi_tid) {
mad_agent_priv = container_of(mad_agent,
@@ -589,6 +1106,8 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
agent);
unregister_mad_snoop(mad_snoop_priv);
}
+ }
+
return 0;
}
EXPORT_SYMBOL(ib_unregister_mad_agent);
@@ -695,7 +1214,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
struct ib_wc mad_wc;
struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
- if (device->node_type == RDMA_NODE_IB_SWITCH)
+ if (device->node_type == RDMA_NODE_IB_SWITCH &&
+ smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
port_num = send_wr->wr.ud.port_num;
else
port_num = mad_agent_priv->agent.port_num;
@@ -1028,12 +1548,20 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->send_buf.mad,
sge[0].length,
DMA_TO_DEVICE);
- mad_send_wr->header_mapping = sge[0].addr;
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+ return -ENOMEM;
sge[1].addr = ib_dma_map_single(mad_agent->device,
ib_get_payload(mad_send_wr),
sge[1].length,
DMA_TO_DEVICE);
+
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+ ret = -ENOMEM;
+ goto dma1_err;
+ }
+
+ mad_send_wr->header_mapping = sge[0].addr;
mad_send_wr->payload_mapping = sge[1].addr;
spin_lock_irqsave(&qp_info->send_queue.lock, flags);
@@ -1051,14 +1579,51 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
list_add_tail(&mad_send_wr->mad_list.list, list);
}
spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
- if (ret) {
+
+ if (!ret)
+ return 0;
+
ib_dma_unmap_single(mad_agent->device,
mad_send_wr->header_mapping,
- sge[0].length, DMA_TO_DEVICE);
+ sge[1].length, DMA_TO_DEVICE);
+dma1_err:
ib_dma_unmap_single(mad_agent->device,
mad_send_wr->payload_mapping,
- sge[1].length, DMA_TO_DEVICE);
+ sge[0].length, DMA_TO_DEVICE);
+ return ret;
+}
+
+/*
+ * Send an SA MAD that has passed congestion control
+ */
+static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr,
+ u32 timeout_ms, u32 retries_left)
+{
+ int ret;
+ unsigned long flags;
+ struct ib_mad_agent_private *mad_agent_priv;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ mad_send_wr->retries_left = retries_left;
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
}
+
return ret;
}
@@ -1125,6 +1690,12 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
+ if (is_sa_cc_mad(mad_send_wr)) {
+ mad_send_wr->is_sa_cc_mad = 1;
+ ret = sa_cc_mad_send(mad_send_wr);
+ if (ret < 0)
+ goto error;
+ } else {
/* Reference MAD agent until send completes */
atomic_inc(&mad_agent_priv->refcount);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
@@ -1147,6 +1718,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
goto error;
}
}
+ }
return 0;
error:
if (bad_send_buf)
@@ -1206,10 +1778,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method,
{
int i;
- for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
- i < IB_MGMT_MAX_METHODS;
- i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
- 1+i)) {
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
if ((*method)->agent[i]) {
printk(KERN_ERR PFX "Method %d already in use\n", i);
return -EINVAL;
@@ -1343,13 +1912,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
goto error3;
/* Finally, add in methods being registered */
- for (i = find_first_bit(mad_reg_req->method_mask,
- IB_MGMT_MAX_METHODS);
- i < IB_MGMT_MAX_METHODS;
- i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
- 1+i)) {
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
(*method)->agent[i] = agent_priv;
- }
+
return 0;
error3:
@@ -1442,13 +2007,9 @@ check_in_use:
goto error4;
/* Finally, add in methods being registered */
- for (i = find_first_bit(mad_reg_req->method_mask,
- IB_MGMT_MAX_METHODS);
- i < IB_MGMT_MAX_METHODS;
- i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
- 1+i)) {
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
(*method)->agent[i] = agent_priv;
- }
+
return 0;
error4:
@@ -1614,6 +2175,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
mad->mad_hdr.class_version].class;
if (!class)
goto out;
+ if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+ IB_MGMT_MAX_METHODS)
+ goto out;
method = class->method_table[convert_mgmt_class(
mad->mad_hdr.mgmt_class)];
if (method)
@@ -1856,6 +2420,26 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
}
+static bool generate_unmatched_resp(struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET ||
+ recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, sizeof *response);
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+ response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ response->mad.mad.mad_hdr.status =
+ cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION;
+
+ return true;
+ } else {
+ return false;
+ }
+}
static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
struct ib_wc *wc)
{
@@ -1865,6 +2449,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
struct ib_mad_list_head *mad_list;
struct ib_mad_agent_private *mad_agent;
int port_num;
+ int ret = IB_MAD_RESULT_SUCCESS;
mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
qp_info = mad_list->mad_queue->qp_info;
@@ -1948,8 +2533,6 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
local:
/* Give driver "right of first refusal" on incoming MAD */
if (port_priv->device->process_mad) {
- int ret;
-
ret = port_priv->device->process_mad(port_priv->device, 0,
port_priv->port_num,
wc, &recv->grh,
@@ -1977,6 +2560,10 @@ local:
* or via recv_handler in ib_mad_complete_recv()
*/
recv = NULL;
+ } else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+ generate_unmatched_resp(recv, response)) {
+ agent_send_response(&response->mad.mad, &recv->grh, wc,
+ port_priv->device, port_num, qp_info->qp->qp_num);
}
out:
@@ -1992,9 +2579,10 @@ out:
static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
{
struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long delay;
if (list_empty(&mad_agent_priv->wait_list)) {
- del_timer(&mad_agent_priv->timeout_timer);
+ cancel_delayed_work(&mad_agent_priv->timed_work);
} else {
mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
struct ib_mad_send_wr_private,
@@ -2003,8 +2591,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
if (time_after(mad_agent_priv->timeout,
mad_send_wr->timeout)) {
mad_agent_priv->timeout = mad_send_wr->timeout;
- mod_timer(&mad_agent_priv->timeout_timer,
- mad_send_wr->timeout);
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
}
}
}
@@ -2031,14 +2622,15 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
temp_mad_send_wr->timeout))
break;
}
- } else
+ }
+ else
list_item = &mad_agent_priv->wait_list;
list_add(&mad_send_wr->agent_list, list_item);
/* Reschedule a work item if we have a shorter timeout */
if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
- mod_timer(&mad_agent_priv->timeout_timer,
- mad_send_wr->timeout);
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
}
void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
@@ -2090,9 +2682,12 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
mad_send_wc->status = mad_send_wr->status;
if (ret == IB_RMPP_RESULT_INTERNAL)
ib_rmpp_send_handler(mad_send_wc);
- else
+ else {
+ if (mad_send_wr->is_sa_cc_mad)
+ sa_cc_mad_done(get_cc_obj(mad_send_wr));
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
mad_send_wc);
+ }
/* Release reference on agent taken when sending */
deref_mad_agent(mad_agent_priv);
@@ -2272,6 +2867,7 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
INIT_LIST_HEAD(&cancel_list);
+ cancel_sa_cc_mads(mad_agent_priv);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
&mad_agent_priv->send_list, agent_list) {
@@ -2293,6 +2889,8 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
&cancel_list, agent_list) {
mad_send_wc.send_buf = &mad_send_wr->send_buf;
list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->is_sa_cc_mad)
+ sa_cc_mad_done(get_cc_obj(mad_send_wr));
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
&mad_send_wc);
atomic_dec(&mad_agent_priv->refcount);
@@ -2332,7 +2930,13 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent,
agent);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
- if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms))
+ return -EINVAL;
+ return 0;
+ }
+ if (mad_send_wr->status != IB_WC_SUCCESS) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return -EINVAL;
}
@@ -2482,10 +3086,10 @@ static void timeout_sends(struct work_struct *work)
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_send_wc mad_send_wc;
- unsigned long flags;
+ unsigned long flags, delay;
mad_agent_priv = container_of(work, struct ib_mad_agent_private,
- timeout_work);
+ timed_work.work);
mad_send_wc.vendor_err = 0;
spin_lock_irqsave(&mad_agent_priv->lock, flags);
@@ -2495,8 +3099,12 @@ static void timeout_sends(struct work_struct *work)
agent_list);
if (time_after(mad_send_wr->timeout, jiffies)) {
- mod_timer(&mad_agent_priv->timeout_timer,
- mad_send_wr->timeout);
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
break;
}
@@ -2512,6 +3120,8 @@ static void timeout_sends(struct work_struct *work)
else
mad_send_wc.status = mad_send_wr->status;
mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ if (mad_send_wr->is_sa_cc_mad)
+ sa_cc_mad_done(get_cc_obj(mad_send_wr));
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
&mad_send_wc);
@@ -2572,6 +3182,14 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
sizeof *mad_priv -
sizeof mad_priv->header,
DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+ sg_list.addr))) {
+ ret = -ENOMEM;
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ printk(KERN_ERR PFX "ib_dma_map_single failed\n");
+ break;
+ }
+
mad_priv->header.mapping = sg_list.addr;
recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
mad_priv->header.mad_list.mad_queue = recv_queue;
@@ -2645,6 +3263,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
int ret, i;
struct ib_qp_attr *attr;
struct ib_qp *qp;
+ u16 pkey_index = 0;
attr = kmalloc(sizeof *attr, GFP_KERNEL);
if (!attr) {
@@ -2652,6 +3271,11 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
return -ENOMEM;
}
+ ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+ 0xFFFF, &pkey_index);
+ if (ret)
+ pkey_index = 0;
+
for (i = 0; i < IB_MAD_QPS_CORE; i++) {
qp = port_priv->qp_info[i].qp;
if (!qp)
@@ -2662,7 +3286,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
* one is needed for the Reset to Init transition
*/
attr->qp_state = IB_QPS_INIT;
- attr->pkey_index = 0;
+ attr->pkey_index = pkey_index;
attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
ret = ib_modify_qp(qp, attr, IB_QP_STATE |
IB_QP_PKEY_INDEX | IB_QP_QKEY);
@@ -2858,6 +3482,10 @@ static int ib_mad_port_open(struct ib_device *device,
}
INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+ if (sa_cc_init(&port_priv->sa_cc))
+ goto error9;
+
+
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_add_tail(&port_priv->port_list, &ib_mad_port_list);
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
@@ -2865,17 +3493,19 @@ static int ib_mad_port_open(struct ib_device *device,
ret = ib_mad_port_start(port_priv);
if (ret) {
printk(KERN_ERR PFX "Couldn't start port\n");
- goto error9;
+ goto error10;
}
return 0;
-error9:
+error10:
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_del_init(&port_priv->port_list);
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
destroy_workqueue(port_priv->wq);
+error9:
+ sa_cc_destroy(&port_priv->sa_cc);
error8:
destroy_mad_qp(&port_priv->qp_info[1]);
error7:
@@ -2915,6 +3545,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
destroy_workqueue(port_priv->wq);
+ sa_cc_destroy(&port_priv->sa_cc);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
ib_dereg_mr(port_priv->mr);
@@ -2983,6 +3614,9 @@ static void ib_mad_remove_device(struct ib_device *device)
{
int i, num_ports, cur_port;
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
if (device->node_type == RDMA_NODE_IB_SWITCH) {
num_ports = 1;
cur_port = 0;
@@ -3017,8 +3651,6 @@ static int __init ib_mad_init_module(void)
mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
- spin_lock_init(&ib_mad_port_list_lock);
-
ib_mad_cache = kmem_cache_create("ib_mad",
sizeof(struct ib_mad_private),
0,
@@ -3054,4 +3686,3 @@ static void __exit ib_mad_cleanup_module(void)
module_init(ib_mad_init_module);
module_exit(ib_mad_cleanup_module);
-
diff --git a/sys/ofed/drivers/infiniband/core/mad_priv.h b/sys/ofed/drivers/infiniband/core/mad_priv.h
index 8b4df0a..e2cd0ac 100644
--- a/sys/ofed/drivers/infiniband/core/mad_priv.h
+++ b/sys/ofed/drivers/infiniband/core/mad_priv.h
@@ -102,8 +102,7 @@ struct ib_mad_agent_private {
struct list_head send_list;
struct list_head wait_list;
struct list_head done_list;
- struct work_struct timeout_work;
- struct timer_list timeout_timer;
+ struct delayed_work timed_work;
unsigned long timeout;
struct list_head local_list;
struct work_struct local_work;
@@ -122,6 +121,14 @@ struct ib_mad_snoop_private {
struct completion comp;
};
+/* Structure for timeout-fifo entry */
+struct tf_entry {
+ unsigned long exp_time; /* entry expiration time */
+ struct list_head fifo_list; /* to keep entries in fifo order */
+ struct list_head to_list; /* to keep entries in timeout order */
+ int canceled; /* indicates whether entry is canceled */
+};
+
struct ib_mad_send_wr_private {
struct ib_mad_list_head mad_list;
struct list_head agent_list;
@@ -147,6 +154,10 @@ struct ib_mad_send_wr_private {
int seg_num;
int newwin;
int pad;
+
+ /* SA congestion controlled MAD */
+ int is_sa_cc_mad;
+ struct tf_entry tf_list;
};
struct ib_mad_local_private {
@@ -198,6 +209,25 @@ struct ib_mad_qp_info {
atomic_t snoop_count;
};
+struct to_fifo {
+ struct list_head to_head;
+ struct list_head fifo_head;
+ spinlock_t lists_lock;
+ struct timer_list timer;
+ struct work_struct work;
+ u32 fifo_size;
+ u32 num_items;
+ int stop_enqueue;
+ struct workqueue_struct *workq;
+};
+
+/* SA congestion control data */
+struct sa_cc_data {
+ spinlock_t lock;
+ unsigned long outstanding;
+ struct to_fifo *tf;
+};
+
struct ib_mad_port_private {
struct list_head port_list;
struct ib_device *device;
@@ -212,6 +242,7 @@ struct ib_mad_port_private {
struct workqueue_struct *wq;
struct work_struct work;
struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+ struct sa_cc_data sa_cc;
};
int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.c b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
index 4e0f282..f37878c 100644
--- a/sys/ofed/drivers/infiniband/core/mad_rmpp.c
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
@@ -31,6 +31,8 @@
* SOFTWARE.
*/
+#include <linux/slab.h>
+
#include "mad_priv.h"
#include "mad_rmpp.h"
diff --git a/sys/ofed/drivers/infiniband/core/multicast.c b/sys/ofed/drivers/infiniband/core/multicast.c
index f8d7ef8..ef595b2 100644
--- a/sys/ofed/drivers/infiniband/core/multicast.c
+++ b/sys/ofed/drivers/infiniband/core/multicast.c
@@ -34,12 +34,27 @@
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include <linux/rbtree.h>
#include <rdma/ib_cache.h>
#include "sa.h"
+static int mcast_leave_retries = 3;
+
+/*static const struct kernel_param_ops retry_ops = {
+ .set = param_set_int,
+ .get = param_get_int,
+};
+
+module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644);
+MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave "
+ "requests before giving up (default: 3)");
+*/
static void mcast_add_one(struct ib_device *device);
static void mcast_remove_one(struct ib_device *device);
@@ -250,6 +265,34 @@ static u8 get_leave_state(struct mcast_group *group)
return leave_state & group->rec.join_state;
}
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value)
+{
+ int err;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
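
check_selector() reports a mismatch (nonzero) when the source value does not stand in the requested relation to the destination value, and reports success when the relevant selector or value bit is missing from the component mask. A standalone sketch of just the comparison part; the SEL_* constants are illustrative stand-ins for the IB_SA_GT/IB_SA_LT/IB_SA_EQ selectors:

#include <stdio.h>

enum { SEL_GT, SEL_LT, SEL_EQ };	/* stand-ins, not the real IB_SA_* values */

/* Nonzero means the requested relation between src and dst does not hold. */
static int
selector_mismatch(int selector, unsigned char src, unsigned char dst)
{
	switch (selector) {
	case SEL_GT:
		return (src <= dst);	/* wanted: src strictly greater */
	case SEL_LT:
		return (src >= dst);	/* wanted: src strictly smaller */
	case SEL_EQ:
		return (src != dst);	/* wanted: src equal to dst */
	default:
		return (0);		/* unknown selector: accept */
	}
}

int
main(void)
{
	printf("%d\n", selector_mismatch(SEL_GT, 4, 3));	/* 0: 4 > 3 holds */
	printf("%d\n", selector_mismatch(SEL_EQ, 4, 3));	/* 1: mismatch */
	return (0);
}
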
+
static int cmp_rec(struct ib_sa_mcmember_rec *src,
struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
{
@@ -262,7 +305,7 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src,
return -EINVAL;
if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
return -EINVAL;
- if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
src->mtu, dst->mtu))
return -EINVAL;
@@ -271,11 +314,11 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src,
return -EINVAL;
if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
return -EINVAL;
- if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
src->rate, dst->rate))
return -EINVAL;
- if (ib_sa_check_selector(comp_mask,
+ if (check_selector(comp_mask,
IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
dst->packet_life_time_selector,
@@ -517,11 +560,15 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
{
struct mcast_group *group = context;
- if (status && (group->retries > 0) &&
+ if (status && group->retries > 0 &&
!send_leave(group, group->leave_state))
group->retries--;
- else
+ else {
+ if (status && group->retries <= 0)
+ printk(KERN_WARNING "reached max retry count. "
+ "status=%d. Giving up\n", status);
mcast_work_handler(&group->work);
+ }
}
static struct mcast_group *acquire_group(struct mcast_port *port,
@@ -544,7 +591,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port,
if (!group)
return NULL;
- group->retries = 3;
+ group->retries = mcast_leave_retries;
group->port = port;
group->rec.mgid = *mgid;
group->pkey_index = MCAST_INVALID_PKEY_INDEX;
@@ -754,7 +801,6 @@ static void mcast_event_handler(struct ib_event_handler *handler,
switch (event->event) {
case IB_EVENT_PORT_ERR:
case IB_EVENT_LID_CHANGE:
- case IB_EVENT_SM_CHANGE:
case IB_EVENT_CLIENT_REREGISTER:
mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
break;
diff --git a/sys/ofed/drivers/infiniband/core/notice.c b/sys/ofed/drivers/infiniband/core/notice.c
deleted file mode 100644
index ca91d96d..0000000
--- a/sys/ofed/drivers/infiniband/core/notice.c
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * Copyright (c) 2006 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/completion.h>
-#include <linux/dma-mapping.h>
-#include <linux/err.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/bitops.h>
-#include <linux/random.h>
-
-#include "sa.h"
-
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling");
-MODULE_LICENSE("Dual BSD/GPL");
-
-static void inform_add_one(struct ib_device *device);
-static void inform_remove_one(struct ib_device *device);
-
-static struct ib_client inform_client = {
- .name = "ib_notice",
- .add = inform_add_one,
- .remove = inform_remove_one
-};
-
-static struct ib_sa_client sa_client;
-static struct workqueue_struct *inform_wq;
-
-struct inform_device;
-
-struct inform_port {
- struct inform_device *dev;
- spinlock_t lock;
- struct rb_root table;
- atomic_t refcount;
- struct completion comp;
- u8 port_num;
-};
-
-struct inform_device {
- struct ib_device *device;
- struct ib_event_handler event_handler;
- int start_port;
- int end_port;
- struct inform_port port[0];
-};
-
-enum inform_state {
- INFORM_IDLE,
- INFORM_REGISTERING,
- INFORM_MEMBER,
- INFORM_BUSY,
- INFORM_ERROR
-};
-
-struct inform_member;
-
-struct inform_group {
- u16 trap_number;
- struct rb_node node;
- struct inform_port *port;
- spinlock_t lock;
- struct work_struct work;
- struct list_head pending_list;
- struct list_head active_list;
- struct list_head notice_list;
- struct inform_member *last_join;
- int members;
- enum inform_state join_state; /* State relative to SA */
- atomic_t refcount;
- enum inform_state state;
- struct ib_sa_query *query;
- int query_id;
-};
-
-struct inform_member {
- struct ib_inform_info info;
- struct ib_sa_client *client;
- struct inform_group *group;
- struct list_head list;
- enum inform_state state;
- atomic_t refcount;
- struct completion comp;
-};
-
-struct inform_notice {
- struct list_head list;
- struct ib_sa_notice notice;
-};
-
-static void reg_handler(int status, struct ib_sa_inform *inform,
- void *context);
-static void unreg_handler(int status, struct ib_sa_inform *inform,
- void *context);
-
-static struct inform_group *inform_find(struct inform_port *port,
- u16 trap_number)
-{
- struct rb_node *node = port->table.rb_node;
- struct inform_group *group;
-
- while (node) {
- group = rb_entry(node, struct inform_group, node);
- if (trap_number < group->trap_number)
- node = node->rb_left;
- else if (trap_number > group->trap_number)
- node = node->rb_right;
- else
- return group;
- }
- return NULL;
-}
-
-static struct inform_group *inform_insert(struct inform_port *port,
- struct inform_group *group)
-{
- struct rb_node **link = &port->table.rb_node;
- struct rb_node *parent = NULL;
- struct inform_group *cur_group;
-
- while (*link) {
- parent = *link;
- cur_group = rb_entry(parent, struct inform_group, node);
- if (group->trap_number < cur_group->trap_number)
- link = &(*link)->rb_left;
- else if (group->trap_number > cur_group->trap_number)
- link = &(*link)->rb_right;
- else
- return cur_group;
- }
- rb_link_node(&group->node, parent, link);
- rb_insert_color(&group->node, &port->table);
- return NULL;
-}
-
-static void deref_port(struct inform_port *port)
-{
- if (atomic_dec_and_test(&port->refcount))
- complete(&port->comp);
-}
-
-static void release_group(struct inform_group *group)
-{
- struct inform_port *port = group->port;
- unsigned long flags;
-
- spin_lock_irqsave(&port->lock, flags);
- if (atomic_dec_and_test(&group->refcount)) {
- rb_erase(&group->node, &port->table);
- spin_unlock_irqrestore(&port->lock, flags);
- kfree(group);
- deref_port(port);
- } else
- spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void deref_member(struct inform_member *member)
-{
- if (atomic_dec_and_test(&member->refcount))
- complete(&member->comp);
-}
-
-static void queue_reg(struct inform_member *member)
-{
- struct inform_group *group = member->group;
- unsigned long flags;
-
- spin_lock_irqsave(&group->lock, flags);
- list_add(&member->list, &group->pending_list);
- if (group->state == INFORM_IDLE) {
- group->state = INFORM_BUSY;
- atomic_inc(&group->refcount);
- queue_work(inform_wq, &group->work);
- }
- spin_unlock_irqrestore(&group->lock, flags);
-}
-
-static int send_reg(struct inform_group *group, struct inform_member *member)
-{
- struct inform_port *port = group->port;
- struct ib_sa_inform inform;
- int ret;
-
- memset(&inform, 0, sizeof inform);
- inform.lid_range_begin = cpu_to_be16(0xFFFF);
- inform.is_generic = 1;
- inform.subscribe = 1;
- inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
- inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number);
- inform.trap.generic.resp_time = 19;
- inform.trap.generic.producer_type =
- cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
-
- group->last_join = member;
- ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
- port->port_num, &inform, 3000, GFP_KERNEL,
- reg_handler, group,&group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
-}
-
-static int send_unreg(struct inform_group *group)
-{
- struct inform_port *port = group->port;
- struct ib_sa_inform inform;
- int ret;
-
- memset(&inform, 0, sizeof inform);
- inform.lid_range_begin = cpu_to_be16(0xFFFF);
- inform.is_generic = 1;
- inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
- inform.trap.generic.trap_num = cpu_to_be16(group->trap_number);
- inform.trap.generic.qpn = IB_QP1;
- inform.trap.generic.resp_time = 19;
- inform.trap.generic.producer_type =
- cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
-
- ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
- port->port_num, &inform, 3000, GFP_KERNEL,
- unreg_handler, group, &group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
-}
-
-static void join_group(struct inform_group *group, struct inform_member *member)
-{
- member->state = INFORM_MEMBER;
- group->members++;
- list_move(&member->list, &group->active_list);
-}
-
-static int fail_join(struct inform_group *group, struct inform_member *member,
- int status)
-{
- spin_lock_irq(&group->lock);
- list_del_init(&member->list);
- spin_unlock_irq(&group->lock);
- return member->info.callback(status, &member->info, NULL);
-}
-
-static void process_group_error(struct inform_group *group)
-{
- struct inform_member *member;
- int ret;
-
- spin_lock_irq(&group->lock);
- while (!list_empty(&group->active_list)) {
- member = list_entry(group->active_list.next,
- struct inform_member, list);
- atomic_inc(&member->refcount);
- list_del_init(&member->list);
- group->members--;
- member->state = INFORM_ERROR;
- spin_unlock_irq(&group->lock);
-
- ret = member->info.callback(-ENETRESET, &member->info, NULL);
- deref_member(member);
- if (ret)
- ib_sa_unregister_inform_info(&member->info);
- spin_lock_irq(&group->lock);
- }
-
- group->join_state = INFORM_IDLE;
- group->state = INFORM_BUSY;
- spin_unlock_irq(&group->lock);
-}
-
-/*
- * Report a notice to all active subscribers. We use a temporary list to
- * handle unsubscription requests while the notice is being reported, which
- * avoids holding the group lock while in the user's callback.
- */
-static void process_notice(struct inform_group *group,
- struct inform_notice *info_notice)
-{
- struct inform_member *member;
- struct list_head list;
- int ret;
-
- INIT_LIST_HEAD(&list);
-
- spin_lock_irq(&group->lock);
- list_splice_init(&group->active_list, &list);
- while (!list_empty(&list)) {
-
- member = list_entry(list.next, struct inform_member, list);
- atomic_inc(&member->refcount);
- list_move(&member->list, &group->active_list);
- spin_unlock_irq(&group->lock);
-
- ret = member->info.callback(0, &member->info,
- &info_notice->notice);
- deref_member(member);
- if (ret)
- ib_sa_unregister_inform_info(&member->info);
- spin_lock_irq(&group->lock);
- }
- spin_unlock_irq(&group->lock);
-}
-
-static void inform_work_handler(struct work_struct *work)
-{
- struct inform_group *group;
- struct inform_member *member;
- struct ib_inform_info *info;
- struct inform_notice *info_notice;
- int status, ret;
-
- group = container_of(work, typeof(*group), work);
-retest:
- spin_lock_irq(&group->lock);
- while (!list_empty(&group->pending_list) ||
- !list_empty(&group->notice_list) ||
- (group->state == INFORM_ERROR)) {
-
- if (group->state == INFORM_ERROR) {
- spin_unlock_irq(&group->lock);
- process_group_error(group);
- goto retest;
- }
-
- if (!list_empty(&group->notice_list)) {
- info_notice = list_entry(group->notice_list.next,
- struct inform_notice, list);
- list_del(&info_notice->list);
- spin_unlock_irq(&group->lock);
- process_notice(group, info_notice);
- kfree(info_notice);
- goto retest;
- }
-
- member = list_entry(group->pending_list.next,
- struct inform_member, list);
- info = &member->info;
- atomic_inc(&member->refcount);
-
- if (group->join_state == INFORM_MEMBER) {
- join_group(group, member);
- spin_unlock_irq(&group->lock);
- ret = info->callback(0, info, NULL);
- } else {
- spin_unlock_irq(&group->lock);
- status = send_reg(group, member);
- if (!status) {
- deref_member(member);
- return;
- }
- ret = fail_join(group, member, status);
- }
-
- deref_member(member);
- if (ret)
- ib_sa_unregister_inform_info(&member->info);
- spin_lock_irq(&group->lock);
- }
-
- if (!group->members && (group->join_state == INFORM_MEMBER)) {
- group->join_state = INFORM_IDLE;
- spin_unlock_irq(&group->lock);
- if (send_unreg(group))
- goto retest;
- } else {
- group->state = INFORM_IDLE;
- spin_unlock_irq(&group->lock);
- release_group(group);
- }
-}
-
-/*
- * Fail a join request if it is still active - at the head of the pending queue.
- */
-static void process_join_error(struct inform_group *group, int status)
-{
- struct inform_member *member;
- int ret;
-
- spin_lock_irq(&group->lock);
- member = list_entry(group->pending_list.next,
- struct inform_member, list);
- if (group->last_join == member) {
- atomic_inc(&member->refcount);
- list_del_init(&member->list);
- spin_unlock_irq(&group->lock);
- ret = member->info.callback(status, &member->info, NULL);
- deref_member(member);
- if (ret)
- ib_sa_unregister_inform_info(&member->info);
- } else
- spin_unlock_irq(&group->lock);
-}
-
-static void reg_handler(int status, struct ib_sa_inform *inform, void *context)
-{
- struct inform_group *group = context;
-
- if (status)
- process_join_error(group, status);
- else
- group->join_state = INFORM_MEMBER;
-
- inform_work_handler(&group->work);
-}
-
-static void unreg_handler(int status, struct ib_sa_inform *rec, void *context)
-{
- struct inform_group *group = context;
-
- inform_work_handler(&group->work);
-}
-
-int notice_dispatch(struct ib_device *device, u8 port_num,
- struct ib_sa_notice *notice)
-{
- struct inform_device *dev;
- struct inform_port *port;
- struct inform_group *group;
- struct inform_notice *info_notice;
-
- dev = ib_get_client_data(device, &inform_client);
- if (!dev)
- return 0; /* No one to give notice to. */
-
- port = &dev->port[port_num - dev->start_port];
- spin_lock_irq(&port->lock);
- group = inform_find(port, __be16_to_cpu(notice->trap.
- generic.trap_num));
- if (!group) {
- spin_unlock_irq(&port->lock);
- return 0;
- }
-
- atomic_inc(&group->refcount);
- spin_unlock_irq(&port->lock);
-
- info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL);
- if (!info_notice) {
- release_group(group);
- return -ENOMEM;
- }
-
- info_notice->notice = *notice;
-
- spin_lock_irq(&group->lock);
- list_add(&info_notice->list, &group->notice_list);
- if (group->state == INFORM_IDLE) {
- group->state = INFORM_BUSY;
- spin_unlock_irq(&group->lock);
- inform_work_handler(&group->work);
- } else {
- spin_unlock_irq(&group->lock);
- release_group(group);
- }
-
- return 0;
-}
-
-static struct inform_group *acquire_group(struct inform_port *port,
- u16 trap_number, gfp_t gfp_mask)
-{
- struct inform_group *group, *cur_group;
- unsigned long flags;
-
- spin_lock_irqsave(&port->lock, flags);
- group = inform_find(port, trap_number);
- if (group)
- goto found;
- spin_unlock_irqrestore(&port->lock, flags);
-
- group = kzalloc(sizeof *group, gfp_mask);
- if (!group)
- return NULL;
-
- group->port = port;
- group->trap_number = trap_number;
- INIT_LIST_HEAD(&group->pending_list);
- INIT_LIST_HEAD(&group->active_list);
- INIT_LIST_HEAD(&group->notice_list);
- INIT_WORK(&group->work, inform_work_handler);
- spin_lock_init(&group->lock);
-
- spin_lock_irqsave(&port->lock, flags);
- cur_group = inform_insert(port, group);
- if (cur_group) {
- kfree(group);
- group = cur_group;
- } else
- atomic_inc(&port->refcount);
-found:
- atomic_inc(&group->refcount);
- spin_unlock_irqrestore(&port->lock, flags);
- return group;
-}
-
-/*
- * We serialize all join requests to a single group to make our lives much
- * easier. Otherwise, two users could try to join the same group
- * simultaneously, with different configurations, one could leave while the
- * join is in progress, etc., which makes locking around error recovery
- * difficult.
- */
-struct ib_inform_info *
-ib_sa_register_inform_info(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- u16 trap_number, gfp_t gfp_mask,
- int (*callback)(int status,
- struct ib_inform_info *info,
- struct ib_sa_notice *notice),
- void *context)
-{
- struct inform_device *dev;
- struct inform_member *member;
- struct ib_inform_info *info;
- int ret;
-
- dev = ib_get_client_data(device, &inform_client);
- if (!dev)
- return ERR_PTR(-ENODEV);
-
- member = kzalloc(sizeof *member, gfp_mask);
- if (!member)
- return ERR_PTR(-ENOMEM);
-
- ib_sa_client_get(client);
- member->client = client;
- member->info.trap_number = trap_number;
- member->info.callback = callback;
- member->info.context = context;
- init_completion(&member->comp);
- atomic_set(&member->refcount, 1);
- member->state = INFORM_REGISTERING;
-
- member->group = acquire_group(&dev->port[port_num - dev->start_port],
- trap_number, gfp_mask);
- if (!member->group) {
- ret = -ENOMEM;
- goto err;
- }
-
- /*
- * The user will get the info structure in their callback. They
- * could then free the info structure before we can return from
- * this routine. So we save the pointer to return before queuing
- * any callback.
- */
- info = &member->info;
- queue_reg(member);
- return info;
-
-err:
- ib_sa_client_put(member->client);
- kfree(member);
- return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(ib_sa_register_inform_info);
-
-void ib_sa_unregister_inform_info(struct ib_inform_info *info)
-{
- struct inform_member *member;
- struct inform_group *group;
-
- member = container_of(info, struct inform_member, info);
- group = member->group;
-
- spin_lock_irq(&group->lock);
- if (member->state == INFORM_MEMBER)
- group->members--;
-
- list_del_init(&member->list);
-
- if (group->state == INFORM_IDLE) {
- group->state = INFORM_BUSY;
- spin_unlock_irq(&group->lock);
- /* Continue to hold reference on group until callback */
- queue_work(inform_wq, &group->work);
- } else {
- spin_unlock_irq(&group->lock);
- release_group(group);
- }
-
- deref_member(member);
- wait_for_completion(&member->comp);
- ib_sa_client_put(member->client);
- kfree(member);
-}
-EXPORT_SYMBOL(ib_sa_unregister_inform_info);
-
-static void inform_groups_lost(struct inform_port *port)
-{
- struct inform_group *group;
- struct rb_node *node;
- unsigned long flags;
-
- spin_lock_irqsave(&port->lock, flags);
- for (node = rb_first(&port->table); node; node = rb_next(node)) {
- group = rb_entry(node, struct inform_group, node);
- spin_lock(&group->lock);
- if (group->state == INFORM_IDLE) {
- atomic_inc(&group->refcount);
- queue_work(inform_wq, &group->work);
- }
- group->state = INFORM_ERROR;
- spin_unlock(&group->lock);
- }
- spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static void inform_event_handler(struct ib_event_handler *handler,
- struct ib_event *event)
-{
- struct inform_device *dev;
-
- dev = container_of(handler, struct inform_device, event_handler);
-
- switch (event->event) {
- case IB_EVENT_PORT_ERR:
- case IB_EVENT_LID_CHANGE:
- case IB_EVENT_SM_CHANGE:
- case IB_EVENT_CLIENT_REREGISTER:
- inform_groups_lost(&dev->port[event->element.port_num -
- dev->start_port]);
- break;
- default:
- break;
- }
-}
-
-static void inform_add_one(struct ib_device *device)
-{
- struct inform_device *dev;
- struct inform_port *port;
- int i;
-
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
- GFP_KERNEL);
- if (!dev)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- dev->start_port = dev->end_port = 0;
- else {
- dev->start_port = 1;
- dev->end_port = device->phys_port_cnt;
- }
-
- for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- port = &dev->port[i];
- port->dev = dev;
- port->port_num = dev->start_port + i;
- spin_lock_init(&port->lock);
- port->table = RB_ROOT;
- init_completion(&port->comp);
- atomic_set(&port->refcount, 1);
- }
-
- dev->device = device;
- ib_set_client_data(device, &inform_client, dev);
-
- INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler);
- ib_register_event_handler(&dev->event_handler);
-}
-
-static void inform_remove_one(struct ib_device *device)
-{
- struct inform_device *dev;
- struct inform_port *port;
- int i;
-
- dev = ib_get_client_data(device, &inform_client);
- if (!dev)
- return;
-
- ib_unregister_event_handler(&dev->event_handler);
- flush_workqueue(inform_wq);
-
- for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- port = &dev->port[i];
- deref_port(port);
- wait_for_completion(&port->comp);
- }
-
- kfree(dev);
-}
-
-int notice_init(void)
-{
- int ret;
-
- inform_wq = create_singlethread_workqueue("ib_inform");
- if (!inform_wq)
- return -ENOMEM;
-
- ib_sa_register_client(&sa_client);
-
- ret = ib_register_client(&inform_client);
- if (ret)
- goto err;
- return 0;
-
-err:
- ib_sa_unregister_client(&sa_client);
- destroy_workqueue(inform_wq);
- return ret;
-}
-
-void notice_cleanup(void)
-{
- ib_unregister_client(&inform_client);
- ib_sa_unregister_client(&sa_client);
- destroy_workqueue(inform_wq);
-}
diff --git a/sys/ofed/drivers/infiniband/core/packer.c b/sys/ofed/drivers/infiniband/core/packer.c
index 019bd4b..9f42595 100644
--- a/sys/ofed/drivers/infiniband/core/packer.c
+++ b/sys/ofed/drivers/infiniband/core/packer.c
@@ -31,6 +31,7 @@
* SOFTWARE.
*/
+#include <linux/module.h>
#include <linux/string.h>
#include <rdma/ib_pack.h>
diff --git a/sys/ofed/drivers/infiniband/core/peer_mem.c b/sys/ofed/drivers/infiniband/core/peer_mem.c
new file mode 100644
index 0000000..cd716a4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/peer_mem.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_peer_mem.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+static DEFINE_MUTEX(peer_memory_mutex);
+static LIST_HEAD(peer_memory_list);
+
+static int num_registered_peers;
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added to the sysctl in the future. */
+
+#if 0
+static struct kobject *peers_kobj;
+static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj);
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%s\n", ib_peer_client->peer_mem->version);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_alloc_mrs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_alloc_mrs);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_reg_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_reg_pages);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_dereg_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_dereg_pages);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_free_callbacks_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_free_callbacks);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static struct kobj_attribute version_attr = __ATTR_RO(version);
+static struct kobj_attribute num_alloc_mrs = __ATTR_RO(num_alloc_mrs);
+static struct kobj_attribute num_reg_pages = __ATTR_RO(num_reg_pages);
+static struct kobj_attribute num_dereg_pages = __ATTR_RO(num_dereg_pages);
+static struct kobj_attribute num_free_callbacks = __ATTR_RO(num_free_callbacks);
+
+static struct attribute *peer_mem_attrs[] = {
+ &version_attr.attr,
+ &num_alloc_mrs.attr,
+ &num_reg_pages.attr,
+ &num_dereg_pages.attr,
+ &num_free_callbacks.attr,
+ NULL,
+};
+#endif
+
+#if 0
+static void destroy_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
+{
+ kobject_put(ib_peer_client->kobj);
+ if (!num_registered_peers)
+ kobject_put(peers_kobj);
+
+ return;
+}
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added to the sysctl in the future. */
+
+static int create_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
+{
+ int ret;
+
+ if (!num_registered_peers) {
+ /* creating under /sys/kernel/mm */
+ peers_kobj = kobject_create_and_add("memory_peers", mm_kobj);
+ if (!peers_kobj)
+ return -ENOMEM;
+ }
+
+ ib_peer_client->peer_mem_attr_group.attrs = peer_mem_attrs;
+ /* Dir was already created explicitly to get its kernel object for further usage */
+ ib_peer_client->peer_mem_attr_group.name = NULL;
+ ib_peer_client->kobj = kobject_create_and_add(ib_peer_client->peer_mem->name,
+ peers_kobj);
+
+ if (!ib_peer_client->kobj) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ /* Create the files associated with this kobject */
+ ret = sysfs_create_group(ib_peer_client->kobj,
+ &ib_peer_client->peer_mem_attr_group);
+ if (ret)
+ goto peer_free;
+
+ return 0;
+
+peer_free:
+ kobject_put(ib_peer_client->kobj);
+
+free:
+ if (!num_registered_peers)
+ kobject_put(peers_kobj);
+
+ return ret;
+}
+#endif
+
+static int ib_invalidate_peer_memory(void *reg_handle,
+ void *core_context)
+{
+ struct ib_peer_memory_client *ib_peer_client =
+ (struct ib_peer_memory_client *)reg_handle;
+ struct invalidation_ctx *invalidation_ctx;
+ struct core_ticket *core_ticket;
+ int need_unlock = 1;
+
+ mutex_lock(&ib_peer_client->lock);
+ ib_peer_client->stats.num_free_callbacks += 1;
+ core_ticket = ib_peer_search_context(ib_peer_client,
+ (unsigned long)core_context);
+ if (!core_ticket)
+ goto out;
+
+ invalidation_ctx = (struct invalidation_ctx *)core_ticket->context;
+ /* If the context is not ready yet, mark it to be invalidated */
+ if (!invalidation_ctx->func) {
+ invalidation_ctx->peer_invalidated = 1;
+ goto out;
+ }
+
+ invalidation_ctx->func(invalidation_ctx->cookie,
+ invalidation_ctx->umem, 0, 0);
+ if (invalidation_ctx->inflight_invalidation) {
+
+ /* init the completion to wait on before letting the other thread run */
+ init_completion(&invalidation_ctx->comp);
+ mutex_unlock(&ib_peer_client->lock);
+ need_unlock = 0;
+ wait_for_completion(&invalidation_ctx->comp);
+ }
+
+ kfree(invalidation_ctx);
+
+out:
+ if (need_unlock)
+ mutex_unlock(&ib_peer_client->lock);
+
+ return 0;
+}
+
+/* access to the peer client is under its lock - no extra locking is needed */
+unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client,
+ void *context)
+{
+ struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL);
+
+ ib_peer_client->last_ticket++;
+ core_ticket->context = context;
+ core_ticket->key = ib_peer_client->last_ticket;
+
+ list_add_tail(&core_ticket->ticket_list,
+ &ib_peer_client->core_ticket_list);
+
+ return core_ticket->key;
+}
+
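+/*
+ * Remove the ticket identified by 'key' from the core ticket list.
+ * Returns 0 if the ticket was found and freed, 1 otherwise.
+ */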
+int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client,
+ unsigned long key)
+{
+ struct core_ticket *core_ticket, *tmp;
+
+ list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
+ ticket_list) {
+ if (core_ticket->key == key) {
+ list_del(&core_ticket->ticket_list);
+ kfree(core_ticket);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client,
+ unsigned long key)
+{
+ struct core_ticket *core_ticket, *tmp;
+ list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
+ ticket_list) {
+ if (core_ticket->key == key)
+ return core_ticket;
+ }
+
+ return NULL;
+}
+
+
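+/*
+ * Reject peer clients that do not implement every callback the core
+ * depends on; optional callbacks (e.g. release) are not checked here.
+ */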
+static int ib_memory_peer_check_mandatory(struct peer_memory_client
+ *peer_client)
+{
+#define PEER_MEM_MANDATORY_FUNC(x) {\
+ offsetof(struct peer_memory_client, x), #x }
+
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ PEER_MEM_MANDATORY_FUNC(acquire),
+ PEER_MEM_MANDATORY_FUNC(get_pages),
+ PEER_MEM_MANDATORY_FUNC(put_pages),
+ PEER_MEM_MANDATORY_FUNC(get_page_size),
+ PEER_MEM_MANDATORY_FUNC(dma_map),
+ PEER_MEM_MANDATORY_FUNC(dma_unmap)
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((void *) peer_client + mandatory_table[i].offset)) {
+ printk(KERN_WARNING "Peer memory %s is missing mandatory function %s\n",
+ peer_client->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+
+
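+/*
+ * Register a peer memory client with the IB core. On success the
+ * core's invalidation routine is returned through *invalidate_callback
+ * and an opaque handle is returned for use as reg_handle; NULL is
+ * returned on failure.
+ */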
+void *ib_register_peer_memory_client(struct peer_memory_client *peer_client,
+ invalidate_peer_memory *invalidate_callback)
+{
+ int ret = 0;
+ struct ib_peer_memory_client *ib_peer_client = NULL;
+
+ mutex_lock(&peer_memory_mutex);
+ if (ib_memory_peer_check_mandatory(peer_client)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
+ if (!ib_peer_client)
+ goto out;
+ ib_peer_client->peer_mem = peer_client;
+
+ INIT_LIST_HEAD(&ib_peer_client->core_ticket_list);
+ mutex_init(&ib_peer_client->lock);
+#ifdef __FreeBSD__
+ ib_peer_client->holdcount = 0;
+ ib_peer_client->needwakeup = 0;
+ cv_init(&ib_peer_client->peer_cv, "ibprcl");
+#else
+ ret = init_srcu_struct(&ib_peer_client->peer_srcu);
+ if (ret)
+ goto free;
+#endif
+#if 0
+ if (create_peer_sysfs(ib_peer_client))
+ goto free;
+#endif
+ *invalidate_callback = ib_invalidate_peer_memory;
+ list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
+ num_registered_peers++;
+ goto out;
+#if 0
+free:
+ kfree(ib_peer_client);
+ ib_peer_client = NULL;
+#endif
+out:
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+}
+EXPORT_SYMBOL(ib_register_peer_memory_client);
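+/*
+ * Usage sketch (illustrative only, not part of this change): a
+ * hypothetical peer memory provider "example_peer_mem" would fill in
+ * at least the mandatory callbacks checked above and register once at
+ * load time, keeping the returned handle and invalidate callback:
+ *
+ *	static invalidate_peer_memory example_invalidate;
+ *	static struct peer_memory_client example_peer_mem = {
+ *		.name		= "example",
+ *		.acquire	= example_acquire,
+ *		.get_pages	= example_get_pages,
+ *		.dma_map	= example_dma_map,
+ *		.dma_unmap	= example_dma_unmap,
+ *		.put_pages	= example_put_pages,
+ *		.get_page_size	= example_get_page_size,
+ *	};
+ *
+ *	reg_handle = ib_register_peer_memory_client(&example_peer_mem,
+ *						     &example_invalidate);
+ */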
+
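+/*
+ * Unregister a peer memory client. The client is first removed from
+ * the global list and then the core waits for all outstanding holds
+ * (a condition variable on FreeBSD, SRCU elsewhere) before freeing it.
+ */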
+void ib_unregister_peer_memory_client(void *reg_handle)
+{
+ struct ib_peer_memory_client *ib_peer_client =
+ (struct ib_peer_memory_client *)reg_handle;
+
+ mutex_lock(&peer_memory_mutex);
+ /* remove from the list to prevent further use by core clients as it goes down */
+ list_del(&ib_peer_client->core_peer_list);
+#ifdef __FreeBSD__
+ while (ib_peer_client->holdcount != 0) {
+ ib_peer_client->needwakeup = 1;
+ cv_wait(&ib_peer_client->peer_cv, &peer_memory_mutex.sx);
+ }
+ cv_destroy(&ib_peer_client->peer_cv);
+#else
+ mutex_unlock(&peer_memory_mutex);
+ /* peer memory can't go down while there are active clients */
+ synchronize_srcu(&ib_peer_client->peer_srcu);
+ cleanup_srcu_struct(&ib_peer_client->peer_srcu);
+ mutex_lock(&peer_memory_mutex);
+#endif
+ num_registered_peers--;
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * It will be converted to sysctl in the future. */
+#if 0
+ destroy_peer_sysfs(ib_peer_client);
+#endif
+ mutex_unlock(&peer_memory_mutex);
+
+ kfree(ib_peer_client);
+}
+EXPORT_SYMBOL(ib_unregister_peer_memory_client);
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * It will be converted to sysctl in the future. */
+
+#if 0
+static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj)
+{
+ struct ib_peer_memory_client *ib_peer_client;
+
+ mutex_lock(&peer_memory_mutex);
+ list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) {
+ if (ib_peer_client->kobj == kobj)
+ goto found;
+ }
+
+ ib_peer_client = NULL;
+
+found:
+
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+}
+#endif
+
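+/*
+ * Find the peer client, if any, that claims the [addr, addr + size)
+ * range via its acquire() callback. A successful lookup takes a hold
+ * on the client (hold count on FreeBSD, SRCU read lock otherwise)
+ * which must be dropped with ib_put_peer_client().
+ */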
+struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr,
+ size_t size, void **peer_client_context,
+ int *srcu_key)
+{
+ struct ib_peer_memory_client *ib_peer_client;
+ int ret;
+
+ mutex_lock(&peer_memory_mutex);
+ list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) {
+ ret = ib_peer_client->peer_mem->acquire(addr, size,
+ context->peer_mem_private_data,
+ context->peer_mem_name,
+ peer_client_context);
+ if (ret == 1)
+ goto found;
+ }
+
+ ib_peer_client = NULL;
+
+found:
+ if (ib_peer_client) {
+#ifdef __FreeBSD__
+ ib_peer_client->holdcount++;
+#else
+ *srcu_key = srcu_read_lock(&ib_peer_client->peer_srcu);
+#endif
+ }
+
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+
+}
+EXPORT_SYMBOL(ib_get_peer_client);
+
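+/*
+ * Drop the hold taken by ib_get_peer_client(), invoking the client's
+ * optional release() callback and waking up a pending unregister once
+ * the last hold is gone.
+ */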
+void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
+ void *peer_client_context,
+ int srcu_key)
+{
+
+ if (ib_peer_client->peer_mem->release)
+ ib_peer_client->peer_mem->release(peer_client_context);
+
+#ifdef __FreeBSD__
+ ib_peer_client->holdcount--;
+ if (ib_peer_client->holdcount == 0 && ib_peer_client->needwakeup) {
+ cv_signal(&ib_peer_client->peer_cv);
+ }
+#else
+ srcu_read_unlock(&ib_peer_client->peer_srcu, srcu_key);
+#endif
+ return;
+}
+EXPORT_SYMBOL(ib_put_peer_client);
+
diff --git a/sys/ofed/drivers/infiniband/core/sa.h b/sys/ofed/drivers/infiniband/core/sa.h
index b8abdd7..b1d4bbf 100644
--- a/sys/ofed/drivers/infiniband/core/sa.h
+++ b/sys/ofed/drivers/infiniband/core/sa.h
@@ -48,29 +48,6 @@ static inline void ib_sa_client_put(struct ib_sa_client *client)
complete(&client->comp);
}
-int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
- ib_sa_comp_mask selector_mask,
- ib_sa_comp_mask value_mask,
- u8 selector, u8 src_value, u8 dst_value);
-
-int ib_sa_pack_attr(void *dst, void *src, int attr_id);
-
-int ib_sa_unpack_attr(void *dst, void *src, int attr_id);
-
-int ib_sa_path_rec_query(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_path_rec *rec,
- ib_sa_comp_mask comp_mask,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_path_rec *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query);
-
-int sa_db_init(void);
-void sa_db_cleanup(void);
-
int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
struct ib_device *device, u8 port_num,
u8 method,
@@ -86,20 +63,4 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
int mcast_init(void);
void mcast_cleanup(void);
-int ib_sa_informinfo_query(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_inform *rec,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_inform *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query);
-
-int notice_dispatch(struct ib_device *device, u8 port_num,
- struct ib_sa_notice *notice);
-
-int notice_init(void);
-void notice_cleanup(void);
-
#endif /* SA_H */
diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c
index 9c6b4f7..a0c04f5 100644
--- a/sys/ofed/drivers/infiniband/core/sa_query.c
+++ b/sys/ofed/drivers/infiniband/core/sa_query.c
@@ -59,12 +59,10 @@ struct ib_sa_sm_ah {
struct ib_sa_port {
struct ib_mad_agent *agent;
- struct ib_mad_agent *notice_agent;
struct ib_sa_sm_ah *sm_ah;
struct work_struct update_task;
spinlock_t ah_lock;
u8 port_num;
- struct ib_device *device;
};
struct ib_sa_device {
@@ -95,14 +93,14 @@ struct ib_sa_path_query {
struct ib_sa_query sa_query;
};
-struct ib_sa_mcmember_query {
- void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+struct ib_sa_guidinfo_query {
+ void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
void *context;
struct ib_sa_query sa_query;
};
-struct ib_sa_inform_query {
- void (*callback)(int, struct ib_sa_inform *, void *);
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
void *context;
struct ib_sa_query sa_query;
};
@@ -116,10 +114,10 @@ static struct ib_client sa_client = {
.remove = ib_sa_remove_one
};
-static spinlock_t idr_lock;
+static DEFINE_SPINLOCK(idr_lock);
static DEFINE_IDR(query_idr);
-static spinlock_t tid_lock;
+static DEFINE_SPINLOCK(tid_lock);
static u32 tid;
#define PATH_REC_FIELD(field) \
@@ -354,162 +352,34 @@ static const struct ib_field service_rec_table[] = {
.size_bits = 2*64 },
};
-#define INFORM_FIELD(field) \
- .struct_offset_bytes = offsetof(struct ib_sa_inform, field), \
- .struct_size_bytes = sizeof ((struct ib_sa_inform *) 0)->field, \
- .field_name = "sa_inform:" #field
+#define GUIDINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
+ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
+ .field_name = "sa_guidinfo_rec:" #field
-static const struct ib_field inform_table[] = {
- { INFORM_FIELD(gid),
+static const struct ib_field guidinfo_rec_table[] = {
+ { GUIDINFO_REC_FIELD(lid),
.offset_words = 0,
.offset_bits = 0,
- .size_bits = 128 },
- { INFORM_FIELD(lid_range_begin),
- .offset_words = 4,
- .offset_bits = 0,
- .size_bits = 16 },
- { INFORM_FIELD(lid_range_end),
- .offset_words = 4,
- .offset_bits = 16,
- .size_bits = 16 },
- { RESERVED,
- .offset_words = 5,
- .offset_bits = 0,
.size_bits = 16 },
- { INFORM_FIELD(is_generic),
- .offset_words = 5,
+ { GUIDINFO_REC_FIELD(block_num),
+ .offset_words = 0,
.offset_bits = 16,
.size_bits = 8 },
- { INFORM_FIELD(subscribe),
- .offset_words = 5,
- .offset_bits = 24,
- .size_bits = 8 },
- { INFORM_FIELD(type),
- .offset_words = 6,
- .offset_bits = 0,
- .size_bits = 16 },
- { INFORM_FIELD(trap.generic.trap_num),
- .offset_words = 6,
- .offset_bits = 16,
- .size_bits = 16 },
- { INFORM_FIELD(trap.generic.qpn),
- .offset_words = 7,
- .offset_bits = 0,
- .size_bits = 24 },
- { RESERVED,
- .offset_words = 7,
+ { GUIDINFO_REC_FIELD(res1),
+ .offset_words = 0,
.offset_bits = 24,
- .size_bits = 3 },
- { INFORM_FIELD(trap.generic.resp_time),
- .offset_words = 7,
- .offset_bits = 27,
- .size_bits = 5 },
- { RESERVED,
- .offset_words = 8,
- .offset_bits = 0,
.size_bits = 8 },
- { INFORM_FIELD(trap.generic.producer_type),
- .offset_words = 8,
- .offset_bits = 8,
- .size_bits = 24 },
-};
-
-#define NOTICE_FIELD(field) \
- .struct_offset_bytes = offsetof(struct ib_sa_notice, field), \
- .struct_size_bytes = sizeof ((struct ib_sa_notice *) 0)->field, \
- .field_name = "sa_notice:" #field
-
-static const struct ib_field notice_table[] = {
- { NOTICE_FIELD(is_generic),
- .offset_words = 0,
- .offset_bits = 0,
- .size_bits = 1 },
- { NOTICE_FIELD(type),
- .offset_words = 0,
- .offset_bits = 1,
- .size_bits = 7 },
- { NOTICE_FIELD(trap.generic.producer_type),
- .offset_words = 0,
- .offset_bits = 8,
- .size_bits = 24 },
- { NOTICE_FIELD(trap.generic.trap_num),
+ { GUIDINFO_REC_FIELD(res2),
.offset_words = 1,
.offset_bits = 0,
- .size_bits = 16 },
- { NOTICE_FIELD(issuer_lid),
- .offset_words = 1,
- .offset_bits = 16,
- .size_bits = 16 },
- { NOTICE_FIELD(notice_toggle),
- .offset_words = 2,
- .offset_bits = 0,
- .size_bits = 1 },
- { NOTICE_FIELD(notice_count),
- .offset_words = 2,
- .offset_bits = 1,
- .size_bits = 15 },
- { NOTICE_FIELD(data_details),
+ .size_bits = 32 },
+ { GUIDINFO_REC_FIELD(guid_info_list),
.offset_words = 2,
- .offset_bits = 16,
- .size_bits = 432 },
- { NOTICE_FIELD(issuer_gid),
- .offset_words = 16,
.offset_bits = 0,
- .size_bits = 128 },
+ .size_bits = 512 },
};
-int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
- ib_sa_comp_mask selector_mask,
- ib_sa_comp_mask value_mask,
- u8 selector, u8 src_value, u8 dst_value)
-{
- int err;
-
- if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
- return 0;
-
- switch (selector) {
- case IB_SA_GT:
- err = (src_value <= dst_value);
- break;
- case IB_SA_LT:
- err = (src_value >= dst_value);
- break;
- case IB_SA_EQ:
- err = (src_value != dst_value);
- break;
- default:
- err = 0;
- break;
- }
-
- return err;
-}
-
-int ib_sa_pack_attr(void *dst, void *src, int attr_id)
-{
- switch (attr_id) {
- case IB_SA_ATTR_PATH_REC:
- ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-int ib_sa_unpack_attr(void *dst, void *src, int attr_id)
-{
- switch (attr_id) {
- case IB_SA_ATTR_PATH_REC:
- ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -588,7 +458,7 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
port->sm_ah = NULL;
spin_unlock_irqrestore(&port->ah_lock, flags);
- schedule_work(&sa_dev->port[event->element.port_num -
+ queue_work(ib_wq, &sa_dev->port[event->element.port_num -
sa_dev->start_port].update_task);
}
}
@@ -685,6 +555,14 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
ah_attr->grh.hop_limit = rec->hop_limit;
ah_attr->grh.traffic_class = rec->traffic_class;
}
+ if (force_grh) {
+ memcpy(ah_attr->dmac, rec->dmac, 6);
+ ah_attr->vlan_id = rec->vlan_id;
+ } else {
+ memset(ah_attr->dmac, 0, 6);
+ ah_attr->vlan_id = 0xffff;
+ }
+
return 0;
}
EXPORT_SYMBOL(ib_init_ah_from_path);
@@ -791,6 +669,10 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
mad->data, &rec);
+ rec.vlan_id = 0xffff;
+ memset(rec.dmac, 0, ETH_ALEN);
+ memset(rec.smac, 0, ETH_ALEN);
+
query->callback(status, &rec, query->context);
} else
query->callback(status, NULL, query->context);
@@ -801,7 +683,33 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
}
-int ib_sa_path_rec_query(struct ib_sa_client *client,
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
struct ib_device *device, u8 port_num,
struct ib_sa_path_rec *rec,
ib_sa_comp_mask comp_mask,
@@ -867,6 +775,7 @@ err1:
kfree(query);
return ret;
}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
int status,
@@ -1082,26 +991,27 @@ err1:
return ret;
}
-static void ib_sa_inform_callback(struct ib_sa_query *sa_query,
+/* Support GuidInfoRecord */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
int status,
struct ib_sa_mad *mad)
{
- struct ib_sa_inform_query *query =
- container_of(sa_query, struct ib_sa_inform_query, sa_query);
+ struct ib_sa_guidinfo_query *query =
+ container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
if (mad) {
- struct ib_sa_inform rec;
+ struct ib_sa_guidinfo_rec rec;
- ib_unpack(inform_table, ARRAY_SIZE(inform_table),
+ ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
mad->data, &rec);
query->callback(status, &rec, query->context);
} else
query->callback(status, NULL, query->context);
}
-static void ib_sa_inform_release(struct ib_sa_query *sa_query)
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
{
- kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query));
+ kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query));
}
int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
@@ -1115,52 +1025,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
void *context,
struct ib_sa_query **sa_query)
{
- // stub function -
- // called originally from mad.c under mlx4_ib_init_sriov()
- // which calls mlx4_ib_init_alias_guid_service() in alias_GUID.c
- // which goes down to this function
-
- printk("ERROR: function should be called only in SRIOV flow!!!");
-
- return 0;
-}
-
-/**
- * ib_sa_informinfo_query - Start an InformInfo registration.
- * @client:SA client
- * @device:device to send query on
- * @port_num: port number to send query on
- * @rec:Inform record to send in query
- * @timeout_ms:time to wait for response
- * @gfp_mask:GFP mask to use for internal allocations
- * @callback:function called when notice handler registration completes,
- * times out or is canceled
- * @context:opaque user context passed to callback
- * @sa_query:query context, used to cancel query
- *
- * This function sends inform info to register with SA to receive
- * in-service notice.
- * The callback function will be called when the query completes (or
- * fails); status is 0 for a successful response, -EINTR if the query
- * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
- * occurred sending the query. The resp parameter of the callback is
- * only valid if status is 0.
- *
- * If the return value of ib_sa_inform_query() is negative, it is an
- * error code. Otherwise it is a query ID that can be used to cancel
- * the query.
- */
-int ib_sa_informinfo_query(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_inform *rec,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_inform *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query)
-{
- struct ib_sa_inform_query *query;
+ struct ib_sa_guidinfo_query *query;
struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
struct ib_sa_port *port;
struct ib_mad_agent *agent;
@@ -1170,6 +1035,12 @@ int ib_sa_informinfo_query(struct ib_sa_client *client,
if (!sa_dev)
return -ENODEV;
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE) {
+ return -EINVAL;
+ }
+
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
@@ -1190,15 +1061,18 @@ int ib_sa_informinfo_query(struct ib_sa_client *client,
mad = query->sa_query.mad_buf->mad;
init_mad(mad, agent);
- query->sa_query.callback = callback ? ib_sa_inform_callback : NULL;
- query->sa_query.release = ib_sa_inform_release;
- query->sa_query.port = port;
- mad->mad_hdr.method = IB_MGMT_METHOD_SET;
- mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_INFORM_INFO);
+ query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+ query->sa_query.release = ib_sa_guidinfo_rec_release;
- ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data);
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+ mad->data);
*sa_query = &query->sa_query;
+
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
goto err2;
@@ -1209,49 +1083,12 @@ err2:
*sa_query = NULL;
ib_sa_client_put(query->sa_query.client);
free_mad(&query->sa_query);
+
err1:
kfree(query);
return ret;
}
-
-static void ib_sa_notice_resp(struct ib_sa_port *port,
- struct ib_mad_recv_wc *mad_recv_wc)
-{
- struct ib_mad_send_buf *mad_buf;
- struct ib_sa_mad *mad;
- int ret;
- unsigned long flags;
-
- mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0,
- IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
- GFP_KERNEL);
- if (IS_ERR(mad_buf))
- return;
-
- mad = mad_buf->mad;
- memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad);
- mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP;
-
- spin_lock_irqsave(&port->ah_lock, flags);
- if (!port->sm_ah) {
- spin_unlock_irqrestore(&port->ah_lock, flags);
- ib_free_send_mad(mad_buf);
- return;
- }
- kref_get(&port->sm_ah->ref);
- mad_buf->context[0] = &port->sm_ah->ref;
- mad_buf->ah = port->sm_ah->ah;
- spin_unlock_irqrestore(&port->ah_lock, flags);
-
- ret = ib_post_send_mad(mad_buf, NULL);
- if (ret)
- goto err;
-
- return;
-err:
- kref_put(mad_buf->context[0], free_sm_ah);
- ib_free_send_mad(mad_buf);
-}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *mad_send_wc)
@@ -1306,36 +1143,9 @@ static void recv_handler(struct ib_mad_agent *mad_agent,
ib_free_recv_mad(mad_recv_wc);
}
-static void notice_resp_handler(struct ib_mad_agent *agent,
- struct ib_mad_send_wc *mad_send_wc)
-{
- kref_put(mad_send_wc->send_buf->context[0], free_sm_ah);
- ib_free_send_mad(mad_send_wc->send_buf);
-}
-
-static void notice_handler(struct ib_mad_agent *mad_agent,
- struct ib_mad_recv_wc *mad_recv_wc)
-{
- struct ib_sa_port *port;
- struct ib_sa_mad *mad;
- struct ib_sa_notice notice;
-
- port = mad_agent->context;
- mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
- ib_unpack(notice_table, ARRAY_SIZE(notice_table), mad->data, &notice);
-
- if (!notice_dispatch(port->device, port->port_num, &notice))
- ib_sa_notice_resp(port, mad_recv_wc);
- ib_free_recv_mad(mad_recv_wc);
-}
-
static void ib_sa_add_one(struct ib_device *device)
{
struct ib_sa_device *sa_dev;
- struct ib_mad_reg_req reg_req = {
- .mgmt_class = IB_MGMT_CLASS_SUBN_ADM,
- .mgmt_class_version = 2
- };
int s, e, i;
if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
@@ -1372,16 +1182,6 @@ static void ib_sa_add_one(struct ib_device *device)
if (IS_ERR(sa_dev->port[i].agent))
goto err;
- sa_dev->port[i].device = device;
- set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask);
- sa_dev->port[i].notice_agent =
- ib_register_mad_agent(device, i + s, IB_QPT_GSI,
- &reg_req, 0, notice_resp_handler,
- notice_handler, &sa_dev->port[i]);
-
- if (IS_ERR(sa_dev->port[i].notice_agent))
- goto err;
-
INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
}
@@ -1396,7 +1196,7 @@ static void ib_sa_add_one(struct ib_device *device)
INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
if (ib_register_event_handler(&sa_dev->event_handler))
- goto err;
+ goto reg_err;
for (i = 0; i <= e - s; ++i)
if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
@@ -1404,14 +1204,14 @@ static void ib_sa_add_one(struct ib_device *device)
return;
+reg_err:
+ ib_set_client_data(device, &sa_client, NULL);
+ i = e - s;
err:
- while (--i >= 0)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
- if (!IS_ERR(sa_dev->port[i].notice_agent))
- ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
- if (!IS_ERR(sa_dev->port[i].agent))
+ for (; i >= 0; --i)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND &&
+ !IS_ERR(sa_dev->port[i].agent))
ib_unregister_mad_agent(sa_dev->port[i].agent);
- }
kfree(sa_dev);
@@ -1428,11 +1228,10 @@ static void ib_sa_remove_one(struct ib_device *device)
ib_unregister_event_handler(&sa_dev->event_handler);
- flush_scheduled_work();
+ flush_workqueue(ib_wq);
for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
- ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
ib_unregister_mad_agent(sa_dev->port[i].agent);
if (sa_dev->port[i].sm_ah)
kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
@@ -1447,9 +1246,6 @@ static int __init ib_sa_init(void)
{
int ret;
- spin_lock_init(&idr_lock);
- spin_lock_init(&tid_lock);
-
get_random_bytes(&tid, sizeof tid);
ret = ib_register_client(&sa_client);
@@ -1464,23 +1260,7 @@ static int __init ib_sa_init(void)
goto err2;
}
- ret = notice_init();
- if (ret) {
- printk(KERN_ERR "Couldn't initialize notice handling\n");
- goto err3;
- }
-
- ret = sa_db_init();
- if (ret) {
- printk(KERN_ERR "Couldn't initialize local SA\n");
- goto err4;
- }
-
return 0;
-err4:
- notice_cleanup();
-err3:
- mcast_cleanup();
err2:
ib_unregister_client(&sa_client);
err1:
@@ -1489,9 +1269,7 @@ err1:
static void __exit ib_sa_cleanup(void)
{
- sa_db_cleanup();
mcast_cleanup();
- notice_cleanup();
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
}
diff --git a/sys/ofed/drivers/infiniband/core/smi.c b/sys/ofed/drivers/infiniband/core/smi.c
index 8723675..5855e44 100644
--- a/sys/ofed/drivers/infiniband/core/smi.c
+++ b/sys/ofed/drivers/infiniband/core/smi.c
@@ -52,6 +52,10 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
hop_cnt = smp->hop_cnt;
/* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
if (!ib_get_smp_direction(smp)) {
/* C14-9:1 */
if (hop_cnt && hop_ptr == 0) {
@@ -133,6 +137,10 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
hop_cnt = smp->hop_cnt;
/* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
if (!ib_get_smp_direction(smp)) {
/* C14-9:1 -- sender should have incremented hop_ptr */
if (hop_cnt && hop_ptr == 0)
diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c
index 4cd5560..6bcbfb9 100644
--- a/sys/ofed/drivers/infiniband/core/sysfs.c
+++ b/sys/ofed/drivers/infiniband/core/sysfs.c
@@ -37,6 +37,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/fs.h>
+#include <linux/printk.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_pma.h>
@@ -105,7 +106,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
return ret;
return sprintf(buf, "%d: %s\n", attr.state,
- attr.state < ARRAY_SIZE(state_name) ?
+ attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
state_name[attr.state] : "UNKNOWN");
}
@@ -180,19 +181,18 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
{
struct ib_port_attr attr;
char *speed = "";
- int rate;
+ int rate; /* in deci-Gb/sec */
ssize_t ret;
ret = ib_query_port(p->ibdev, p->port_num, &attr);
if (ret)
return ret;
- switch (attr.active_speed) {
- case 2: speed = " DDR"; break;
- case 4: speed = " QDR"; break;
- }
+ ib_active_speed_enum_to_rate(attr.active_speed,
+ &rate,
+ &speed);
- rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
+ rate *= ib_width_enum_to_int(attr.active_width);
if (rate < 0)
return -EINVAL;
@@ -229,9 +229,11 @@ static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
{
switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
case IB_LINK_LAYER_INFINIBAND:
- return sprintf(buf, "%s\n", "IB");
+ return sprintf(buf, "%s\n", "InfiniBand");
case IB_LINK_LAYER_ETHERNET:
return sprintf(buf, "%s\n", "Ethernet");
+ case IB_LINK_LAYER_SCIF:
+ return sprintf(buf, "%s\n", "SCIF");
default:
return sprintf(buf, "%s\n", "Unknown");
}
@@ -267,16 +269,12 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
container_of(attr, struct port_table_attribute, attr);
union ib_gid gid;
ssize_t ret;
- u16 *raw;
ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
if (ret)
return ret;
- raw = (u16 *)gid.raw;
- return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n",
- htons(raw[0]), htons(raw[1]), htons(raw[2]), htons(raw[3]),
- htons(raw[4]), htons(raw[5]), htons(raw[6]), htons(raw[7]));
+ return sprintf(buf, GID_PRINT_FMT "\n", GID_PRINT_ARGS(gid.raw));
}
static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
@@ -351,8 +349,8 @@ static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr,
be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
break;
case 64:
- ret = sprintf(buf, "%llu\n", (unsigned long long)
- be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8)));
+ ret = sprintf(buf, "%llu\n",
+ (unsigned long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8)));
break;
default:
ret = 0;
@@ -536,6 +534,7 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *,
element->attr.attr.mode = S_IRUGO;
element->attr.show = show;
element->index = i;
+ sysfs_attr_init(&element->attr.attr);
tab_attr[i] = &element->attr.attr;
}
@@ -570,7 +569,7 @@ static int add_port(struct ib_device *device, int port_num,
p->port_num = port_num;
ret = kobject_init_and_add(&p->kobj, &port_type,
- kobject_get(device->ports_parent),
+ device->ports_parent,
"%d", port_num);
if (ret)
goto err_put;
@@ -609,7 +608,6 @@ static int add_port(struct ib_device *device, int port_num,
}
list_add_tail(&p->kobj.entry, &device->port_list);
-
#ifdef __linux__
kobject_uevent(&p->kobj, KOBJ_ADD);
#endif
@@ -655,6 +653,7 @@ static ssize_t show_node_type(struct device *device,
case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type);
default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
}
}
@@ -716,16 +715,75 @@ static ssize_t set_node_desc(struct device *device,
return count;
}
+static ssize_t show_cmd_perf(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%d\n", dev->cmd_perf);
+}
+
+static ssize_t set_cmd_perf(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ u32 val;
+
+ if (sscanf(buf, "0x%x", &val) != 1)
+ return -EINVAL;
+
+ dev->cmd_perf = val;
+
+ return count;
+}
+
+static ssize_t show_cmd_avg(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg);
+}
+
+static ssize_t set_cmd_avg(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ spin_lock(&dev->cmd_perf_lock);
+ dev->cmd_avg = 0;
+ dev->cmd_n = 0;
+ spin_unlock(&dev->cmd_perf_lock);
+
+ return count;
+}
+
+static ssize_t show_cmd_n(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%d\n", dev->cmd_n);
+}
+
static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf);
+static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg);
+static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL);
static struct device_attribute *ib_class_attributes[] = {
&dev_attr_node_type,
&dev_attr_sys_image_guid,
&dev_attr_node_guid,
- &dev_attr_node_desc
+ &dev_attr_node_desc,
+ &dev_attr_cmd_perf,
+ &dev_attr_cmd_avg,
+ &dev_attr_cmd_n,
};
static struct class ib_class = {
@@ -851,7 +909,8 @@ static struct attribute_group iw_stats_group = {
};
int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *, u8, struct kobject *))
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
{
struct device *class_dev = &device->dev;
int ret;
@@ -874,8 +933,7 @@ int ib_device_register_sysfs(struct ib_device *device,
goto err_unregister;
}
- device->ports_parent = kobject_create_and_add("ports",
- kobject_get(&class_dev->kobj));
+ device->ports_parent = kobject_create_and_add("ports", &class_dev->kobj);
if (!device->ports_parent) {
ret = -ENOMEM;
goto err_put;
@@ -919,6 +977,11 @@ err_put:
kobject_put(&class_dev->kobj);
err_unregister:
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ device_remove_file(class_dev, ib_class_attributes[i]);
+ }
+
device_unregister(class_dev);
err:
@@ -927,15 +990,16 @@ err:
void ib_device_unregister_sysfs(struct ib_device *device)
{
+ int i;
struct kobject *p, *t;
struct ib_port *port;
- int i;
+ struct device *class_dev = &device->dev;
/* Hold kobject until ib_dealloc_device() */
kobject_get(&device->dev.kobj);
for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
- device_remove_file(&device->dev, ib_class_attributes[i]);
+ device_remove_file(class_dev, ib_class_attributes[i]);
}
list_for_each_entry_safe(p, t, &device->port_list, entry) {
@@ -960,22 +1024,3 @@ void ib_sysfs_cleanup(void)
{
class_unregister(&ib_class);
}
-
-/*int ib_sysfs_create_port_files(struct ib_device *device,
- int (*create)(struct ib_device *dev, u8 port_num,
- struct kobject *kobj))
-{
- struct kobject *p;
- struct ib_port *port;
- int ret = 0;
-
- list_for_each_entry(p, &device->port_list, entry) {
- port = container_of(p, struct ib_port, kobj);
- ret = create(device, port->port_num, &port->kobj);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_sysfs_create_port_files);*/
diff --git a/sys/ofed/drivers/infiniband/core/ucm.c b/sys/ofed/drivers/infiniband/core/ucm.c
index b912ebe..8f20e89 100644
--- a/sys/ofed/drivers/infiniband/core/ucm.c
+++ b/sys/ofed/drivers/infiniband/core/ucm.c
@@ -37,10 +37,12 @@
#include <linux/device.h>
#include <linux/err.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include <linux/file.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <linux/string.h>
#include <asm/uaccess.h>
@@ -396,7 +398,6 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file,
struct ib_ucm_event_get cmd;
struct ib_ucm_event *uevent;
int result = 0;
- DEFINE_WAIT(wait);
if (out_len < sizeof(struct ib_ucm_event_resp))
return -ENOSPC;
@@ -1123,7 +1124,7 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof(hdr)))
return -EFAULT;
- if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+ if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
return -EINVAL;
if (hdr.in + sizeof(hdr) > len)
@@ -1163,7 +1164,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp)
{
struct ib_ucm_file *file;
- file = kzalloc(sizeof(*file), GFP_KERNEL);
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file)
return -ENOMEM;
@@ -1177,7 +1178,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp)
file->filp = filp;
file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev);
- return 0;
+ return nonseekable_open(inode, filp);
}
static int ib_ucm_close(struct inode *inode, struct file *filp)
@@ -1212,7 +1213,10 @@ static void ib_ucm_release_dev(struct device *dev)
ucm_dev = container_of(dev, struct ib_ucm_device, dev);
cdev_del(&ucm_dev->cdev);
+ if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
clear_bit(ucm_dev->devnum, dev_map);
+ else
+ clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
kfree(ucm_dev);
}
@@ -1222,6 +1226,7 @@ static const struct file_operations ucm_fops = {
.release = ib_ucm_close,
.write = ib_ucm_write,
.poll = ib_ucm_poll,
+ .llseek = no_llseek,
};
static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
@@ -1234,8 +1239,32 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+static int find_overflow_devnum(void)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
+ "infiniband_cm");
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+ if (ret >= IB_UCM_MAX_DEVICES)
+ return -1;
+
+ return ret;
+}
+
static void ib_ucm_add_one(struct ib_device *device)
{
+ int devnum;
+ dev_t base;
struct ib_ucm_device *ucm_dev;
if (!device->alloc_ucontext ||
@@ -1248,16 +1277,25 @@ static void ib_ucm_add_one(struct ib_device *device)
ucm_dev->ib_dev = device;
- ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
- if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
+ devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+ if (devnum >= IB_UCM_MAX_DEVICES) {
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
goto err;
- set_bit(ucm_dev->devnum, dev_map);
+ ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ ucm_dev->devnum = devnum;
+ base = devnum + IB_UCM_BASE_DEV;
+ set_bit(devnum, dev_map);
+ }
cdev_init(&ucm_dev->cdev, &ucm_fops);
ucm_dev->cdev.owner = THIS_MODULE;
kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
- if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+ if (cdev_add(&ucm_dev->cdev, base, 1))
goto err;
ucm_dev->dev.class = &cm_class;
@@ -1278,7 +1316,10 @@ err_dev:
device_unregister(&ucm_dev->dev);
err_cdev:
cdev_del(&ucm_dev->cdev);
- clear_bit(ucm_dev->devnum, dev_map);
+ if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
err:
kfree(ucm_dev);
return;
@@ -1298,6 +1339,7 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att
{
return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION);
}
+
static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
static int __init ib_ucm_init(void)
@@ -1337,6 +1379,8 @@ static void __exit ib_ucm_cleanup(void)
ib_unregister_client(&ucm_client);
class_remove_file(&cm_class, &class_attr_abi_version);
unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
idr_destroy(&ctx_id_table);
}
diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c
index 23cbf7b..5f73b40 100644
--- a/sys/ofed/drivers/infiniband/core/ucma.c
+++ b/sys/ofed/drivers/infiniband/core/ucma.c
@@ -34,10 +34,13 @@
#include <linux/file.h>
#include <linux/mutex.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
#include <rdma/rdma_user_cm.h>
#include <rdma/ib_marshall.h>
@@ -48,9 +51,7 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
MODULE_LICENSE("Dual BSD/GPL");
-enum {
- UCMA_MAX_BACKLOG = 1024
-};
+static unsigned int max_backlog = 1024;
struct ucma_file {
struct mutex mut;
@@ -253,17 +254,17 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (!uevent)
return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+ mutex_lock(&ctx->file->mut);
uevent->cm_id = cm_id;
ucma_set_event_context(ctx, event, uevent);
uevent->resp.event = event->event;
uevent->resp.status = event->status;
- if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB)
+ if (cm_id->qp_type == IB_QPT_UD)
ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
else
ucma_copy_conn_event(&uevent->resp.param.conn,
&event->param.conn);
- mutex_lock(&ctx->file->mut);
if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
if (!ctx->backlog) {
ret = -ENOMEM;
@@ -298,7 +299,6 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
struct rdma_ucm_get_event cmd;
struct ucma_event *uevent;
int ret = 0;
- DEFINE_WAIT(wait);
if (out_len < sizeof uevent->resp)
return -ENOSPC;
@@ -332,6 +332,7 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
ctx->cm_id = uevent->cm_id;
ctx->cm_id->context = ctx;
uevent->resp.id = ctx->id;
+ ctx->cm_id->ucontext = ctx;
}
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -350,13 +351,31 @@ done:
return ret;
}
-static ssize_t ucma_create_id(struct ucma_file *file,
- const char __user *inbuf,
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+ switch (cmd->ps) {
+ case RDMA_PS_TCP:
+ *qp_type = IB_QPT_RC;
+ return 0;
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ *qp_type = IB_QPT_UD;
+ return 0;
+ case RDMA_PS_IB:
+ *qp_type = cmd->qp_type;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
int in_len, int out_len)
{
struct rdma_ucm_create_id cmd;
struct rdma_ucm_create_id_resp resp;
struct ucma_context *ctx;
+ enum ib_qp_type qp_type;
int ret;
if (out_len < sizeof(resp))
@@ -365,6 +384,10 @@ static ssize_t ucma_create_id(struct ucma_file *file,
if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
return -EFAULT;
+ ret = ucma_get_qp_type(&cmd, &qp_type);
+ if (ret)
+ return ret;
+
mutex_lock(&file->mut);
ctx = ucma_alloc_ctx(file);
mutex_unlock(&file->mut);
@@ -372,11 +395,12 @@ static ssize_t ucma_create_id(struct ucma_file *file,
return -ENOMEM;
ctx->uid = cmd.uid;
- ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
+ ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
if (IS_ERR(ctx->cm_id)) {
ret = PTR_ERR(ctx->cm_id);
goto err1;
}
+ ctx->cm_id->ucontext = ctx;
resp.id = ctx->id;
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -409,24 +433,6 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx)
mutex_unlock(&mut);
}
-static void ucma_cleanup_events(struct ucma_context *ctx)
-{
- struct ucma_event *uevent, *tmp;
-
- list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
- if (uevent->ctx != ctx)
- continue;
-
- list_del(&uevent->list);
-
- /* clear incoming connections. */
- if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
- rdma_destroy_id(uevent->cm_id);
-
- kfree(uevent);
- }
-}
-
static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
{
struct ucma_event *uevent, *tmp;
@@ -440,9 +446,16 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
}
}
+/*
+ * We cannot hold file->mut when calling rdma_destroy_id() or we can
+ * deadlock. We also acquire file->mut in ucma_event_handler(), and
+ * rdma_destroy_id() will wait until all callbacks have completed.
+ */
static int ucma_free_ctx(struct ucma_context *ctx)
{
int events_reported;
+ struct ucma_event *uevent, *tmp;
+ LIST_HEAD(list);
/* No new events will be generated after destroying the id. */
rdma_destroy_id(ctx->cm_id);
@@ -451,10 +464,20 @@ static int ucma_free_ctx(struct ucma_context *ctx)
/* Cleanup events not yet reported to the user. */
mutex_lock(&ctx->file->mut);
- ucma_cleanup_events(ctx);
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &list);
+ }
list_del(&ctx->list);
mutex_unlock(&ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &list, list) {
+ list_del(&uevent->list);
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ rdma_destroy_id(uevent->cm_id);
+ kfree(uevent);
+ }
+
events_reported = ctx->events_reported;
kfree(ctx);
return events_reported;
@@ -586,24 +609,14 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
struct rdma_route *route)
{
- struct rdma_dev_addr *dev_addr;
- struct net_device *dev;
- u16 vid = 0;
resp->num_paths = route->num_paths;
switch (route->num_paths) {
case 0:
- dev_addr = &route->addr.dev_addr;
- dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
- if (dev) {
- vid = rdma_vlan_dev_vlan_id(dev);
- dev_put(dev);
- }
-
- iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
- dev_addr->dst_dev_addr, vid);
- iboe_addr_get_sgid(dev_addr,
- (union ib_gid *) &resp->ib_route[0].sgid);
+ rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+ (union ib_gid *)&resp->ib_route[0].dgid);
+ rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+ (union ib_gid *)&resp->ib_route[0].sgid);
resp->ib_route[0].pkey = cpu_to_be16(0xffff);
break;
case 2:
@@ -619,6 +632,16 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
}
}
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
static ssize_t ucma_query_route(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -653,8 +676,10 @@ static ssize_t ucma_query_route(struct ucma_file *file,
resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
resp.port_num = ctx->cm_id->port_num;
- if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) {
- switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) {
+ switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(ctx->cm_id->device,
+ ctx->cm_id->port_num)) {
case IB_LINK_LAYER_INFINIBAND:
ucma_copy_ib_route(&resp, &ctx->cm_id->route);
break;
@@ -664,6 +689,12 @@ static ssize_t ucma_query_route(struct ucma_file *file,
default:
break;
}
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ucma_copy_iw_route(&resp, &ctx->cm_id->route);
+ break;
+ default:
+ break;
}
out:
@@ -727,8 +758,8 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ?
- cmd.backlog : UCMA_MAX_BACKLOG;
+ ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
+ cmd.backlog : max_backlog;
ret = rdma_listen(ctx->cm_id, ctx->backlog);
ucma_put_ctx(ctx);
return ret;
@@ -750,9 +781,12 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
return PTR_ERR(ctx);
if (cmd.conn_param.valid) {
- ctx->uid = cmd.uid;
ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+ mutex_lock(&file->mut);
ret = rdma_accept(ctx->cm_id, &conn_param);
+ if (!ret)
+ ctx->uid = cmd.uid;
+ mutex_unlock(&file->mut);
} else
ret = rdma_accept(ctx->cm_id, NULL);
@@ -848,6 +882,20 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
}
rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
break;
+ case RDMA_OPTION_ID_REUSEADDR:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ case RDMA_OPTION_ID_AFONLY:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
default:
ret = -ENOSYS;
}
@@ -887,12 +935,22 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
void *optval, size_t optlen)
{
- int ret;
+ int ret = 0;
switch (optname) {
case RDMA_OPTION_IB_PATH:
ret = ucma_set_ib_path(ctx, optval, optlen);
break;
+
+ case RDMA_OPTION_IB_APM:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (*(u8 *)optval)
+ ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST);
+ break;
+
default:
ret = -ENOSYS;
}
@@ -937,20 +995,21 @@ static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
optval = kmalloc(cmd.optlen, GFP_KERNEL);
if (!optval) {
ret = -ENOMEM;
- goto out1;
+ goto err_ucma_put_ctx;
}
- if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+ if (copy_from_user(optval, (void __user *)(unsigned long)cmd.optval,
cmd.optlen)) {
ret = -EFAULT;
- goto out2;
+ goto err_kfree;
}
ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
cmd.optlen);
-out2:
+
+err_kfree:
kfree(optval);
-out1:
+err_ucma_put_ctx:
ucma_put_ctx(ctx);
return ret;
}
@@ -1121,7 +1180,7 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
struct rdma_ucm_migrate_id cmd;
struct rdma_ucm_migrate_resp resp;
struct ucma_context *ctx;
- struct file *filp;
+ struct fd f;
struct ucma_file *cur_file;
int ret = 0;
@@ -1129,12 +1188,12 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
return -EFAULT;
/* Get current fd to protect against it being closed */
- filp = fget(cmd.fd);
- if (!filp)
+ f = fdget(cmd.fd);
+ if (!f.file)
return -ENOENT;
/* Validate current fd and prevent destruction of id. */
- ctx = ucma_get_ctx(filp->private_data, cmd.id);
+ ctx = ucma_get_ctx(f.file->private_data, cmd.id);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
goto file_put;
@@ -1168,7 +1227,7 @@ response:
ucma_put_ctx(ctx);
file_put:
- fput(filp);
+ fdput(f);
return ret;
}
@@ -1209,7 +1268,7 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof(hdr)))
return -EFAULT;
- if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+ if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
return -EINVAL;
if (hdr.in + sizeof(hdr) > len)
@@ -1261,7 +1320,8 @@ static int ucma_open(struct inode *inode, struct file *filp)
filp->private_data = file;
file->filp = filp;
- return 0;
+
+ return nonseekable_open(inode, filp);
}
static int ucma_close(struct inode *inode, struct file *filp)
@@ -1291,11 +1351,14 @@ static const struct file_operations ucma_fops = {
.release = ucma_close,
.write = ucma_write,
.poll = ucma_poll,
+ .llseek = no_llseek,
};
static struct miscdevice ucma_misc = {
.minor = MISC_DYNAMIC_MINOR,
.name = "rdma_cm",
+ .nodename = "infiniband/rdma_cm",
+ .mode = 0666,
.fops = &ucma_fops,
};
@@ -1318,10 +1381,11 @@ static int __init ucma_init(void)
ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
if (ret) {
printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
- goto err;
+ goto err1;
}
+
return 0;
-err:
+err1:
misc_deregister(&ucma_misc);
return ret;
}
diff --git a/sys/ofed/drivers/infiniband/core/ud_header.c b/sys/ofed/drivers/infiniband/core/ud_header.c
index 09fc1ff..051d3bd 100644
--- a/sys/ofed/drivers/infiniband/core/ud_header.c
+++ b/sys/ofed/drivers/infiniband/core/ud_header.c
@@ -33,6 +33,7 @@
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/module.h>
#include <linux/if_ether.h>
#include <rdma/ib_pack.h>
@@ -230,32 +231,28 @@ void ib_ud_header_init(int payload_bytes,
int immediate_present,
struct ib_ud_header *header)
{
- u16 packet_length = 0;
-
memset(header, 0, sizeof *header);
if (lrh_present) {
+ u16 packet_length = 0;
+
header->lrh.link_version = 0;
header->lrh.link_next_header =
grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
- packet_length = IB_LRH_BYTES;
+ packet_length = (IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ (grh_present ? IB_GRH_BYTES : 0) +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) / 4; /* round up */
+ header->lrh.packet_length = cpu_to_be16(packet_length);
}
- if (eth_present) {
- if (vlan_present) {
+ if (vlan_present)
header->eth.type = cpu_to_be16(ETH_P_8021Q);
- packet_length += IB_VLAN_BYTES;
- }
- packet_length += IB_ETH_BYTES;
- }
-
- packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes +
- 4 + /* ICRC */
- 3; /* round up */
- packet_length /= 4;
if (grh_present) {
- packet_length += IB_GRH_BYTES / 4;
header->grh.ip_version = 6;
header->grh.payload_length =
cpu_to_be16((IB_BTH_BYTES +
@@ -266,9 +263,6 @@ void ib_ud_header_init(int payload_bytes,
header->grh.next_header = 0x1b;
}
- if (lrh_present)
- header->lrh.packet_length = cpu_to_be16(packet_length);
-
if (immediate_present)
header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
else
@@ -285,36 +279,6 @@ void ib_ud_header_init(int payload_bytes,
EXPORT_SYMBOL(ib_ud_header_init);
/**
- * ib_lrh_header_pack - Pack LRH header struct into wire format
- * @lrh:unpacked LRH header struct
- * @buf:Buffer to pack into
- *
- * ib_lrh_header_pack() packs the LRH header structure @lrh into
- * wire format in the buffer @buf.
- */
-int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf)
-{
- ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf);
- return 0;
-}
-EXPORT_SYMBOL(ib_lrh_header_pack);
-
-/**
- * ib_lrh_header_unpack - Unpack LRH structure from wire format
- * @lrh:unpacked LRH header struct
- * @buf:Buffer to pack into
- *
- * ib_lrh_header_unpack() unpacks the LRH header structure from
- * wire format (in buf) into @lrh.
- */
-int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh)
-{
- ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh);
- return 0;
-}
-EXPORT_SYMBOL(ib_lrh_header_unpack);
-
-/**
* ib_ud_header_pack - Pack UD header struct into wire format
* @header:UD header struct
* @buf:Buffer to pack into
@@ -337,14 +301,11 @@ int ib_ud_header_pack(struct ib_ud_header *header,
&header->eth, buf + len);
len += IB_ETH_BYTES;
}
-
-
if (header->vlan_present) {
ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
&header->vlan, buf + len);
len += IB_VLAN_BYTES;
}
-
if (header->grh_present) {
ib_pack(grh_table, ARRAY_SIZE(grh_table),
&header->grh, buf + len);
diff --git a/sys/ofed/drivers/infiniband/core/umem.c b/sys/ofed/drivers/infiniband/core/umem.c
index 7695a21..cdd2e67 100644
--- a/sys/ofed/drivers/infiniband/core/umem.c
+++ b/sys/ofed/drivers/infiniband/core/umem.c
@@ -35,109 +35,168 @@
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
-#ifdef __linux__
-#include <linux/hugetlb.h>
-#endif
#include <linux/dma-attrs.h>
-
+#include <linux/slab.h>
+#include <linux/module.h>
#include <sys/priv.h>
-#include <sys/resource.h>
#include <sys/resourcevar.h>
-
-#include <vm/vm.h>
-#include <vm/vm_map.h>
-#include <vm/vm_object.h>
#include <vm/vm_pageout.h>
-
+#include <vm/vm_map.h>
#include "uverbs.h"
-static int allow_weak_ordering;
-module_param(allow_weak_ordering, bool, 0444);
-MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory");
+#define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *))
-#define IB_UMEM_MAX_PAGE_CHUNK \
- ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \
- ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \
- (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
-
-#ifdef __ia64__
-extern int dma_map_sg_hp_wa;
+static int allow_weak_ordering;
+module_param_named(weak_ordering, allow_weak_ordering, int, 0444);
+MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory");
-static int dma_map_sg_ia64(struct ib_device *ibdev,
- struct scatterlist *sg,
- int nents,
- enum dma_data_direction dir)
+static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem,
+ struct ib_umem *umem, unsigned long addr,
+ int dmasync, int invalidation_supported)
{
- int i, rc, j, lents = 0;
- struct device *dev;
-
- if (!dma_map_sg_hp_wa)
- return ib_dma_map_sg(ibdev, sg, nents, dir);
+ int ret;
+ const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
+ struct invalidation_ctx *invalidation_ctx = NULL;
- dev = ibdev->dma_device;
- for (i = 0; i < nents; ++i) {
- rc = dma_map_sg(dev, sg + i, 1, dir);
- if (rc <= 0) {
- for (j = 0; j < i; ++j)
- dma_unmap_sg(dev, sg + j, 1, dir);
+ umem->ib_peer_mem = ib_peer_mem;
+ if (invalidation_supported) {
+ invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL);
+ if (!invalidation_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ umem->invalidation_ctx = invalidation_ctx;
+ invalidation_ctx->umem = umem;
+ mutex_lock(&ib_peer_mem->lock);
+ invalidation_ctx->context_ticket =
+ ib_peer_insert_context(ib_peer_mem, invalidation_ctx);
+ /* unlock before calling get_pages to prevent a deadlock from the callback */
+ mutex_unlock(&ib_peer_mem->lock);
+ }
- return 0;
+ ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1,
+ &umem->sg_head,
+ umem->peer_mem_client_context,
+ invalidation_ctx ?
+ (void *)invalidation_ctx->context_ticket : NULL);
+
+ if (invalidation_ctx) {
+ /* take the lock back and verify the pages were not invalidated in the meantime */
+ mutex_lock(&ib_peer_mem->lock);
+ if (invalidation_ctx->peer_invalidated) {
+ printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n");
+ ret = -EINVAL;
}
- lents += rc;
}
- return lents;
+ if (ret)
+ goto out;
+
+ umem->page_size = peer_mem->get_page_size
+ (umem->peer_mem_client_context);
+ if (umem->page_size <= 0)
+ goto put_pages;
+
+ umem->offset = addr & ((unsigned long)umem->page_size - 1);
+ ret = peer_mem->dma_map(&umem->sg_head,
+ umem->peer_mem_client_context,
+ umem->context->device->dma_device,
+ dmasync,
+ &umem->nmap);
+ if (ret)
+ goto put_pages;
+
+ ib_peer_mem->stats.num_reg_pages +=
+ umem->nmap * (umem->page_size >> PAGE_SHIFT);
+ ib_peer_mem->stats.num_alloc_mrs += 1;
+ return umem;
+
+put_pages:
+
+ peer_mem->put_pages(umem->peer_mem_client_context,
+ &umem->sg_head);
+out:
+ if (invalidation_ctx) {
+ ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
+ mutex_unlock(&umem->ib_peer_mem->lock);
+ kfree(invalidation_ctx);
+ }
+
+ ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
+ umem->peer_mem_srcu_key);
+ kfree(umem);
+ return ERR_PTR(ret);
}
-static void dma_unmap_sg_ia64(struct ib_device *ibdev,
- struct scatterlist *sg,
- int nents,
- enum dma_data_direction dir)
+static void peer_umem_release(struct ib_umem *umem)
{
- int i;
- struct device *dev;
-
- if (!dma_map_sg_hp_wa)
- return ib_dma_unmap_sg(ibdev, sg, nents, dir);
+ struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem;
+ const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
+ struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx;
+
+ if (invalidation_ctx) {
+
+ int peer_callback;
+ int inflight_invalidation;
+ /* If we are not under a peer callback we must take the lock before removing
+ * the core ticket from the tree and releasing its umem.
+ * This lets any inflight callbacks finish safely.
+ * If we are under a peer callback, or in the error flow of reg_mr where the
+ * context was never activated, the lock has already been taken.
+ */
+ if (invalidation_ctx->func && !invalidation_ctx->peer_callback)
+ mutex_lock(&ib_peer_mem->lock);
+ ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
+ /* Make sure to check the inflight flag only after taking the lock and
+ * removing the entry from the tree. From this point on use the local copies
+ * of peer_callback and inflight_invalidation: once complete() is signalled,
+ * invalidation_ctx may be freed by the callback and must not be accessed.
+ */
+ peer_callback = invalidation_ctx->peer_callback;
+ inflight_invalidation = invalidation_ctx->inflight_invalidation;
+ if (inflight_invalidation)
+ complete(&invalidation_ctx->comp);
+ /* When running under a peer callback the lock is handled externally */
+ if (!peer_callback)
+ /* unlock before put_pages */
+ mutex_unlock(&ib_peer_mem->lock);
+ /* if we are in callback context or a callback is pending, let it free the invalidation context */
+ if (!peer_callback && !inflight_invalidation)
+ kfree(invalidation_ctx);
+ }
- dev = ibdev->dma_device;
- for (i = 0; i < nents; ++i)
- dma_unmap_sg(dev, sg + i, 1, dir);
-}
+ peer_mem->dma_unmap(&umem->sg_head,
+ umem->peer_mem_client_context,
+ umem->context->device->dma_device);
+ peer_mem->put_pages(&umem->sg_head,
+ umem->peer_mem_client_context);
+
+ ib_peer_mem->stats.num_dereg_pages +=
+ umem->nmap * (umem->page_size >> PAGE_SHIFT);
+ ib_peer_mem->stats.num_dealloc_mrs += 1;
+ ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
+ umem->peer_mem_srcu_key);
+ kfree(umem);
-#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
-#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)
+ return;
-#endif
+}
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
-#ifdef __linux__
- struct ib_umem_chunk *chunk, *tmp;
- int i;
- list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
- ib_dma_unmap_sg_attrs(dev, chunk->page_list,
- chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
- for (i = 0; i < chunk->nents; ++i) {
- struct page *page = sg_page(&chunk->page_list[i]);
- if (umem->writable && dirty)
- set_page_dirty_lock(page);
- put_page(page);
- }
- kfree(chunk);
- }
-#else
- struct ib_umem_chunk *chunk, *tmp;
vm_object_t object;
+ struct scatterlist *sg;
+ struct page *page;
int i;
object = NULL;
- list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
- ib_dma_unmap_sg_attrs(dev, chunk->page_list,
- chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
- for (i = 0; i < chunk->nents; ++i) {
- struct page *page = sg_page(&chunk->page_list[i]);
+ if (umem->nmap > 0)
+ ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+ umem->nmap,
+ DMA_BIDIRECTIONAL);
+ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+ page = sg_page(sg);
if (umem->writable && dirty) {
if (object && object != page->object)
VM_OBJECT_WUNLOCK(object);
@@ -148,14 +207,26 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
vm_page_dirty(page);
}
}
- kfree(chunk);
- }
+ sg_free_table(&umem->sg_head);
if (object)
VM_OBJECT_WUNLOCK(object);
-#endif
}
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+ umem_invalidate_func_t func,
+ void *cookie)
+{
+ struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx;
+
+ invalidation_ctx->func = func;
+ invalidation_ctx->cookie = cookie;
+
+ /* from this point on any pending invalidation may run */
+ mutex_unlock(&umem->ib_peer_mem->lock);
+ return;
+}
+EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
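
When ib_umem_get_ex() is called with invalidation support and a peer client serves the request, the peer-memory lock is left held; arming the notifier is what releases it. A minimal sketch of the intended driver-side usage follows (not part of this patch; the callback signature and the my_* function names are assumptions for illustration):

	#include <linux/printk.h>
	#include <rdma/ib_umem.h>

	/* Hypothetical driver callback; the umem_invalidate_func_t signature
	 * is assumed from this version of the peer-memory API. */
	static void my_umem_invalidate(void *cookie, struct ib_umem *umem,
				       unsigned long addr, size_t size)
	{
		pr_info("invalidating umem %p range %lx+%zx\n", umem, addr, size);
	}

	static struct ib_umem *my_pin_user_memory(struct ib_ucontext *ctx,
						  u64 start, u64 length, int access)
	{
		struct ib_umem *umem;

		umem = ib_umem_get_ex(ctx, start, length, access,
				      0 /* dmasync */, 1 /* invalidation supported */);
		if (IS_ERR(umem))
			return umem;

		/* required when a peer client served the request: this also
		 * drops the peer-memory lock taken by ib_umem_get_ex() */
		if (umem->invalidation_ctx)
			ib_umem_activate_invalidation_notifier(umem,
							       my_umem_invalidate,
							       NULL /* cookie */);
		return umem;
	}
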
/**
* ib_umem_get - Pin and DMA map userspace memory.
* @context: userspace context to pin memory for
@@ -164,163 +235,23 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
* @access: IB_ACCESS_xxx flags for memory being pinned
* @dmasync: flush in-flight DMA when the memory region is written
*/
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
- size_t size, int access, int dmasync)
+struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync,
+ int invalidation_supported)
{
-#ifdef __linux__
- struct ib_umem *umem;
- struct page **page_list;
- struct vm_area_struct **vma_list;
- struct ib_umem_chunk *chunk;
- unsigned long locked;
- unsigned long lock_limit;
- unsigned long cur_base;
- unsigned long npages;
- int ret;
- int off;
- int i;
- DEFINE_DMA_ATTRS(attrs);
-
- if (dmasync)
- dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
- else if (allow_weak_ordering)
- dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
-
- if (!can_do_mlock())
- return ERR_PTR(-EPERM);
- umem = kmalloc(sizeof *umem, GFP_KERNEL);
- if (!umem)
- return ERR_PTR(-ENOMEM);
-
- umem->context = context;
- umem->length = size;
- umem->offset = addr & ~PAGE_MASK;
- umem->page_size = PAGE_SIZE;
- /*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
- * obviously require write access. "Remote atomic" can do
- * things like fetch and add, which will modify memory, and
- * "MW bind" can change permissions by binding a window.
- */
- umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
-
- /* We assume the memory is from hugetlb until proved otherwise */
- umem->hugetlb = 1;
-
- INIT_LIST_HEAD(&umem->chunk_list);
-
- page_list = (struct page **) __get_free_page(GFP_KERNEL);
- if (!page_list) {
- kfree(umem);
- return ERR_PTR(-ENOMEM);
- }
-
- /*
- * if we can't alloc the vma_list, it's not so bad;
- * just assume the memory is not hugetlb memory
- */
- vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
- if (!vma_list)
- umem->hugetlb = 0;
-
- npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
-
- down_write(&current->mm->mmap_sem);
-
- locked = npages + current->mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-
- if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
- ret = -ENOMEM;
- goto out;
- }
-
- cur_base = addr & PAGE_MASK;
-
- ret = 0;
-
- while (npages) {
- ret = get_user_pages(current, current->mm, cur_base,
- min_t(unsigned long, npages,
- PAGE_SIZE / sizeof (struct page *)),
- 1, !umem->writable, page_list, vma_list);
-
- if (ret < 0)
- goto out;
-
- cur_base += ret * PAGE_SIZE;
- npages -= ret;
-
- off = 0;
-
- while (ret) {
- chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
- min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
- GFP_KERNEL);
- if (!chunk) {
- ret = -ENOMEM;
- goto out;
- }
-
- chunk->attrs = attrs;
- chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
- sg_init_table(chunk->page_list, chunk->nents);
- for (i = 0; i < chunk->nents; ++i) {
- if (vma_list &&
- !is_vm_hugetlb_page(vma_list[i + off]))
- umem->hugetlb = 0;
- sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
- }
-
- chunk->nmap = ib_dma_map_sg_attrs(context->device,
- &chunk->page_list[0],
- chunk->nents,
- DMA_BIDIRECTIONAL,
- &attrs);
- if (chunk->nmap <= 0) {
- for (i = 0; i < chunk->nents; ++i)
- put_page(sg_page(&chunk->page_list[i]));
- kfree(chunk);
-
- ret = -ENOMEM;
- goto out;
- }
-
- ret -= chunk->nents;
- off += chunk->nents;
- list_add_tail(&chunk->list, &umem->chunk_list);
- }
-
- ret = 0;
- }
-
-out:
- if (ret < 0) {
- __ib_umem_release(context->device, umem, 0);
- kfree(umem);
- } else
- current->mm->locked_vm = locked;
-
- up_write(&current->mm->mmap_sem);
- if (vma_list)
- free_page((unsigned long) vma_list);
- free_page((unsigned long) page_list);
-
- return ret < 0 ? ERR_PTR(ret) : umem;
-#else
struct ib_umem *umem;
- struct ib_umem_chunk *chunk;
struct proc *proc;
pmap_t pmap;
vm_offset_t end, last, start;
vm_size_t npages;
int error;
- int ents;
int ret;
+ int ents;
int i;
DEFINE_DMA_ATTRS(attrs);
+ struct scatterlist *sg, *sg_list_start;
+ int need_release = 0;
error = priv_check(curthread, PRIV_VM_MLOCK);
if (error)
@@ -372,76 +303,86 @@ out:
* "MW bind" can change permissions by binding a window.
*/
umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+ if (invalidation_supported || context->peer_mem_private_data) {
+
+ struct ib_peer_memory_client *peer_mem_client;
+
+ peer_mem_client = ib_get_peer_client(context, addr, size,
+ &umem->peer_mem_client_context,
+ &umem->peer_mem_srcu_key);
+ if (peer_mem_client)
+ return peer_umem_get(peer_mem_client, umem, addr,
+ dmasync, invalidation_supported);
+ }
+
umem->hugetlb = 0;
- INIT_LIST_HEAD(&umem->chunk_list);
pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
- ret = 0;
- while (npages) {
- ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
- chunk = kmalloc(sizeof(*chunk) +
- (sizeof(struct scatterlist) * ents),
- GFP_KERNEL);
- if (!chunk) {
- ret = -ENOMEM;
+
+ if (npages == 0) {
+ ret = -EINVAL;
goto out;
}
- chunk->attrs = attrs;
- chunk->nents = ents;
- sg_init_table(&chunk->page_list[0], ents);
- for (i = 0; i < chunk->nents; ++i) {
+ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ need_release = 1;
+ sg_list_start = umem->sg_head.sgl;
+
+ while (npages) {
+
+ ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
+ umem->npages += ents;
+
+ for_each_sg(sg_list_start, sg, ents, i) {
vm_paddr_t pa;
pa = pmap_extract(pmap, start);
if (pa == 0) {
ret = -ENOMEM;
- kfree(chunk);
goto out;
}
- sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
+ sg_set_page(sg, PHYS_TO_VM_PAGE(pa),
PAGE_SIZE, 0);
npages--;
start += PAGE_SIZE;
}
- chunk->nmap = ib_dma_map_sg_attrs(context->device,
- &chunk->page_list[0],
- chunk->nents,
+ /* prepare for the next iteration */
+ sg_list_start = sg;
+ }
+
+ umem->nmap = ib_dma_map_sg_attrs(context->device,
+ umem->sg_head.sgl,
+ umem->npages,
DMA_BIDIRECTIONAL,
&attrs);
- if (chunk->nmap != chunk->nents) {
- kfree(chunk);
+ if (umem->nmap != umem->npages) {
ret = -ENOMEM;
goto out;
}
- list_add_tail(&chunk->list, &umem->chunk_list);
- }
-
out:
if (ret < 0) {
+ if (need_release)
__ib_umem_release(context->device, umem, 0);
kfree(umem);
}
return ret < 0 ? ERR_PTR(ret) : umem;
-#endif
}
-EXPORT_SYMBOL(ib_umem_get);
+EXPORT_SYMBOL(ib_umem_get_ex);
-#ifdef __linux__
-static void ib_umem_account(struct work_struct *work)
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
{
- struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
- down_write(&umem->mm->mmap_sem);
- umem->mm->locked_vm -= umem->diff;
- up_write(&umem->mm->mmap_sem);
- mmput(umem->mm);
- kfree(umem);
+ return ib_umem_get_ex(context, addr,
+ size, access, dmasync, 0);
}
-#endif
+EXPORT_SYMBOL(ib_umem_get);
/**
* ib_umem_release - release memory pinned with ib_umem_get
@@ -449,57 +390,28 @@ static void ib_umem_account(struct work_struct *work)
*/
void ib_umem_release(struct ib_umem *umem)
{
-#ifdef __linux__
- struct ib_ucontext *context = umem->context;
- struct mm_struct *mm;
- unsigned long diff;
-
- __ib_umem_release(umem->context->device, umem, 1);
- mm = get_task_mm(current);
- if (!mm) {
- kfree(umem);
- return;
- }
-
- diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
-
- /*
- * We may be called with the mm's mmap_sem already held. This
- * can happen when a userspace munmap() is the call that drops
- * the last reference to our file and calls our release
- * method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. In that case
- * we defer the vm_locked accounting to the system workqueue.
- */
- if (context->closing) {
- if (!down_write_trylock(&mm->mmap_sem)) {
- INIT_WORK(&umem->work, ib_umem_account);
- umem->mm = mm;
- umem->diff = diff;
-
- schedule_work(&umem->work);
- return;
- }
- } else
- down_write(&mm->mmap_sem);
-
- current->mm->locked_vm -= diff;
- up_write(&mm->mmap_sem);
- mmput(mm);
-#else
vm_offset_t addr, end, last, start;
vm_size_t size;
int error;
+ if (umem->ib_peer_mem) {
+ peer_umem_release(umem);
+ return;
+ }
+
__ib_umem_release(umem->context->device, umem, 1);
+
if (umem->context->closing) {
kfree(umem);
return;
}
+
error = priv_check(curthread, PRIV_VM_MUNLOCK);
+
if (error)
return;
+
addr = umem->start;
size = umem->length;
last = addr + size;
@@ -507,69 +419,24 @@ void ib_umem_release(struct ib_umem *umem)
end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
-
-#endif
kfree(umem);
+
}
EXPORT_SYMBOL(ib_umem_release);
int ib_umem_page_count(struct ib_umem *umem)
{
- struct ib_umem_chunk *chunk;
int shift;
int i;
int n;
+ struct scatterlist *sg;
shift = ilog2(umem->page_size);
n = 0;
- list_for_each_entry(chunk, &umem->chunk_list, list)
- for (i = 0; i < chunk->nmap; ++i)
- n += sg_dma_len(&chunk->page_list[i]) >> shift;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+ n += sg_dma_len(sg) >> shift;
return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
-
-/**********************************************/
-/*
- * Stub functions for contiguous pages -
- * We currently do not support this feature
- */
-/**********************************************/
-
-/**
- * ib_cmem_release_contiguous_pages - release memory allocated by
- * ib_cmem_alloc_contiguous_pages.
- * @cmem: cmem struct to release
- */
-void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
-{
-}
-EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
-
-/**
- * * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
- * * @context: userspace context to allocate memory for
- * * @total_size: total required size for that allocation.
- * * @page_size_order: order of one contiguous page.
- * */
-struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
- unsigned long total_size,
- unsigned long page_size_order)
-{
- return NULL;
-}
-EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
-
-/**
- * * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
- * * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
- * * @vma: VMA to inject pages into.
- * */
-int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
- struct vm_area_struct *vma)
-{
- return 0;
-}
-EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
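
The switch from the old per-chunk page lists to a single sg_table changes how consumers walk a pinned umem. A minimal sketch of the new iteration pattern, matching the ib_umem_page_count() logic above (illustration only, not part of the patch; my_dump_umem is a hypothetical helper):

	#include <linux/kernel.h>
	#include <linux/scatterlist.h>
	#include <rdma/ib_umem.h>

	/* Walk the DMA-mapped entries with for_each_sg() instead of the
	 * removed chunk_list. */
	static void my_dump_umem(struct ib_umem *umem)
	{
		struct scatterlist *sg;
		int shift = ilog2(umem->page_size);
		int i, n = 0;

		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
			unsigned int pages = sg_dma_len(sg) >> shift;

			pr_debug("entry %d: dma %llx, %u pages\n", i,
				 (unsigned long long)sg_dma_address(sg), pages);
			n += pages;
		}
		pr_debug("total pages: %d (ib_umem_page_count() == %d)\n",
			 n, ib_umem_page_count(umem));
	}
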
diff --git a/sys/ofed/drivers/infiniband/core/user_mad.c b/sys/ofed/drivers/infiniband/core/user_mad.c
index 161c65f..cc4a659 100644
--- a/sys/ofed/drivers/infiniband/core/user_mad.c
+++ b/sys/ofed/drivers/infiniband/core/user_mad.c
@@ -43,7 +43,9 @@
#include <linux/mutex.h>
#include <linux/kref.h>
#include <linux/compat.h>
+#include <linux/sched.h>
#include <linux/semaphore.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -63,12 +65,9 @@ enum {
};
/*
- * Our lifetime rules for these structs are the following: each time a
- * device special file is opened, we look up the corresponding struct
- * ib_umad_port by minor in the umad_port[] table while holding the
- * port_lock. If this lookup succeeds, we take a reference on the
- * ib_umad_port's struct ib_umad_device while still holding the
- * port_lock; if the lookup fails, we fail the open(). We drop these
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
* references in the corresponding close().
*
* In addition to references coming from open character devices, there
@@ -76,12 +75,7 @@ enum {
* module's reference taken when allocating the ib_umad_device in
* ib_umad_add_one().
*
- * When destroying an ib_umad_device, we clear all of its
- * ib_umad_ports from umad_port[] while holding port_lock before
- * dropping the module's reference to the ib_umad_device. This is
- * always safe because any open() calls will either succeed and obtain
- * a reference before we clear the umad_port[] entries, or fail after
- * we clear the umad_port[] entries.
+ * When destroying an ib_umad_device, we drop the module's reference.
*/
struct ib_umad_port {
@@ -99,6 +93,7 @@ struct ib_umad_port {
struct ib_umad_device *umad_dev;
int dev_num;
u8 port_num;
+ struct list_head port_lst;
};
struct ib_umad_device {
@@ -135,18 +130,85 @@ static struct class *umad_class;
static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
static DEFINE_SPINLOCK(port_lock);
-static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
static void ib_umad_add_one(struct ib_device *device);
static void ib_umad_remove_one(struct ib_device *device);
-static void ib_umad_release_dev(struct kref *ref)
+static DEFINE_SPINLOCK(ports_list_lock);
+static struct list_head ports_list;
+
+
+static void remove_ports(struct kref *ref)
+{
+ int i;
+ struct ib_umad_port *p, *p1;
+ struct ib_umad_device *dev =
+ container_of(ref, struct ib_umad_device, ref);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; ++i) {
+ struct ib_umad_port *port = &dev->port[i];
+
+ list_for_each_entry_safe(p, p1, &ports_list, port_lst)
+ if (p == port) {
+ list_del(&p->port_lst);
+ break;
+ }
+ }
+}
+
+static void put_umad_dev(struct kref *ref)
{
+ int ret, i;
struct ib_umad_device *dev =
container_of(ref, struct ib_umad_device, ref);
+ spin_lock(&ports_list_lock);
+ ret = (kref_put(ref, remove_ports));
+ spin_unlock(&ports_list_lock);
+ if (ret) {
+ for (i = 0; i <= dev->end_port - dev->start_port; ++i) {
+ if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(dev->port[i].dev_num, dev_map);
+ else
+ clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map);
+ cdev_del(dev->port[i].cdev);
+ cdev_del(dev->port[i].sm_cdev);
+ }
kfree(dev);
+ }
+}
+
+static void release_port(struct ib_umad_port *port)
+{
+ put_umad_dev(&port->umad_dev->ref);
+}
+
+
+static struct ib_umad_port *get_port(struct cdev *cdev)
+{
+ struct ib_umad_port *port;
+
+ spin_lock(&ports_list_lock);
+ list_for_each_entry(port, &ports_list, port_lst) {
+ if (port->cdev == cdev || port->sm_cdev == cdev) {
+ kref_get(&port->umad_dev->ref);
+ spin_unlock(&ports_list_lock);
+
+ return port;
+ }
+ }
+ spin_unlock(&ports_list_lock);
+
+ return NULL;
+}
+
+static void insert_port(struct ib_umad_port *port)
+{
+ spin_lock(&ports_list_lock);
+ list_add(&port->port_lst, &ports_list);
+ spin_unlock(&ports_list_lock);
}
static int hdr_size(struct ib_umad_file *file)
@@ -466,8 +528,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
goto err;
}
- if (packet->mad.hdr.id < 0 ||
- packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
ret = -EINVAL;
goto err;
}
@@ -679,7 +740,7 @@ found:
file->already_used = 1;
if (!file->use_pkey_index) {
printk(KERN_WARNING "user_mad: process %s did not enable "
- "P_Key index support.\n", curproc->p_comm);
+ "P_Key index support.\n", curthread->td_proc->p_comm);
printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt "
"has info on the new ABI.\n");
}
@@ -711,7 +772,7 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
mutex_lock(&file->port->file_mutex);
mutex_lock(&file->mutex);
- if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+ if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
ret = -EINVAL;
goto out;
}
@@ -779,41 +840,33 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
/*
* ib_umad_open() does not need the BKL:
*
- * - umad_port[] accesses are protected by port_lock, the
- * ib_umad_port structures are properly reference counted, and
+ * - the ib_umad_port structures are properly reference counted, and
* everything else is purely local to the file being created, so
* races against other open calls are not a problem;
* - the ioctl method does not affect any global state outside of the
* file structure being operated on;
- * - the port is added to umad_port[] as the last part of module
- * initialization so the open method will either immediately run
- * -ENXIO, or all required initialization will be done.
*/
static int ib_umad_open(struct inode *inode, struct file *filp)
{
struct ib_umad_port *port;
struct ib_umad_file *file;
- int ret = 0;
-
- spin_lock(&port_lock);
- port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
- if (port)
- kref_get(&port->umad_dev->ref);
- spin_unlock(&port_lock);
+ int ret;
+ port = get_port(inode->i_cdev->si_drv1);
if (!port)
return -ENXIO;
mutex_lock(&port->file_mutex);
if (!port->ib_dev) {
+ release_port(port);
ret = -ENXIO;
goto out;
}
file = kzalloc(sizeof *file, GFP_KERNEL);
if (!file) {
- kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+ release_port(port);
ret = -ENOMEM;
goto out;
}
@@ -830,6 +883,8 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
list_add_tail(&file->port_list, &port->file_list);
+ ret = nonseekable_open(inode, filp);
+
out:
mutex_unlock(&port->file_mutex);
return ret;
@@ -838,7 +893,7 @@ out:
static int ib_umad_close(struct inode *inode, struct file *filp)
{
struct ib_umad_file *file = filp->private_data;
- struct ib_umad_device *dev = file->port->umad_dev;
+ struct ib_umad_port *port = file->port;
struct ib_umad_packet *packet, *tmp;
int already_dead;
int i;
@@ -867,7 +922,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
mutex_unlock(&file->port->file_mutex);
kfree(file);
- kref_put(&dev->ref, ib_umad_release_dev);
+ release_port(port);
return 0;
}
@@ -882,7 +937,8 @@ static const struct file_operations umad_fops = {
.compat_ioctl = ib_umad_compat_ioctl,
#endif
.open = ib_umad_open,
- .release = ib_umad_close
+ .release = ib_umad_close,
+ .llseek = no_llseek,
};
static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -893,12 +949,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
};
int ret;
- spin_lock(&port_lock);
- port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
- if (port)
- kref_get(&port->umad_dev->ref);
- spin_unlock(&port_lock);
-
+ port = get_port(inode->i_cdev->si_drv1);
if (!port)
return -ENXIO;
@@ -922,10 +973,10 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
filp->private_data = port;
- return 0;
+ return nonseekable_open(inode, filp);
fail:
- kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+ release_port(port);
return ret;
}
@@ -944,7 +995,7 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
up(&port->sm_sem);
- kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+ release_port(port);
return ret;
}
@@ -952,7 +1003,8 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
static const struct file_operations umad_sm_fops = {
.owner = THIS_MODULE,
.open = ib_umad_sm_open,
- .release = ib_umad_sm_close
+ .release = ib_umad_sm_close,
+ .llseek = no_llseek,
};
static struct ib_client umad_client = {
@@ -991,31 +1043,66 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att
}
static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+static dev_t overflow_maj;
+static int find_overflow_devnum(void)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+ if (ret >= IB_UMAD_MAX_PORTS)
+ return -1;
+
+ return ret;
+}
+
static int ib_umad_init_port(struct ib_device *device, int port_num,
struct ib_umad_port *port)
{
+ int devnum;
+ dev_t base;
+
spin_lock(&port_lock);
- port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
- if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (devnum >= IB_UMAD_MAX_PORTS) {
spin_unlock(&port_lock);
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
return -1;
+
+ spin_lock(&port_lock);
+ port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ port->dev_num = devnum;
+ base = devnum + base_dev;
+ set_bit(devnum, dev_map);
}
- set_bit(port->dev_num, dev_map);
spin_unlock(&port_lock);
port->ib_dev = device;
port->port_num = port_num;
- init_MUTEX(&port->sm_sem);
+ sema_init(&port->sm_sem, 1);
mutex_init(&port->file_mutex);
INIT_LIST_HEAD(&port->file_list);
port->cdev = cdev_alloc();
if (!port->cdev)
- return -1;
- port->cdev->owner = THIS_MODULE;
+ goto err_cdev_c;
+
port->cdev->ops = &umad_fops;
+ port->cdev->owner = THIS_MODULE;
kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
- if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+ if (cdev_add(port->cdev, base, 1))
goto err_cdev;
port->dev = device_create(umad_class, device->dma_device,
@@ -1029,13 +1116,15 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
if (device_create_file(port->dev, &dev_attr_port))
goto err_dev;
+ base += IB_UMAD_MAX_PORTS;
port->sm_cdev = cdev_alloc();
if (!port->sm_cdev)
goto err_dev;
- port->sm_cdev->owner = THIS_MODULE;
+
port->sm_cdev->ops = &umad_sm_fops;
+ port->sm_cdev->owner = THIS_MODULE;
kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
- if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+ if (cdev_add(port->sm_cdev, base, 1))
goto err_sm_cdev;
port->sm_dev = device_create(umad_class, device->dma_device,
@@ -1049,10 +1138,6 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
if (device_create_file(port->sm_dev, &dev_attr_port))
goto err_sm_dev;
- spin_lock(&port_lock);
- umad_port[port->dev_num] = port;
- spin_unlock(&port_lock);
-
return 0;
err_sm_dev:
@@ -1066,7 +1151,11 @@ err_dev:
err_cdev:
cdev_del(port->cdev);
- clear_bit(port->dev_num, dev_map);
+err_cdev_c:
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
return -1;
}
@@ -1074,7 +1163,6 @@ err_cdev:
static void ib_umad_kill_port(struct ib_umad_port *port)
{
struct ib_umad_file *file;
- int already_dead;
int id;
dev_set_drvdata(port->dev, NULL);
@@ -1083,20 +1171,12 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
device_destroy(umad_class, port->cdev->dev);
device_destroy(umad_class, port->sm_cdev->dev);
- cdev_del(port->cdev);
- cdev_del(port->sm_cdev);
-
- spin_lock(&port_lock);
- umad_port[port->dev_num] = NULL;
- spin_unlock(&port_lock);
-
mutex_lock(&port->file_mutex);
port->ib_dev = NULL;
list_for_each_entry(file, &port->file_list, port_list) {
mutex_lock(&file->mutex);
- already_dead = file->agents_dead;
file->agents_dead = 1;
mutex_unlock(&file->mutex);
@@ -1106,8 +1186,6 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
}
mutex_unlock(&port->file_mutex);
-
- clear_bit(port->dev_num, dev_map);
}
static void ib_umad_add_one(struct ib_device *device)
@@ -1136,10 +1214,12 @@ static void ib_umad_add_one(struct ib_device *device)
umad_dev->start_port = s;
umad_dev->end_port = e;
+ for (i = 0; i <= e - s; ++i)
+ insert_port(&umad_dev->port[i]);
+
for (i = s; i <= e; ++i) {
umad_dev->port[i - s].umad_dev = umad_dev;
- if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
goto err;
}
@@ -1150,10 +1230,9 @@ static void ib_umad_add_one(struct ib_device *device)
err:
while (--i >= s)
- if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
ib_umad_kill_port(&umad_dev->port[i - s]);
- kref_put(&umad_dev->ref, ib_umad_release_dev);
+ put_umad_dev(&umad_dev->ref);
}
static void ib_umad_remove_one(struct ib_device *device)
@@ -1165,16 +1244,22 @@ static void ib_umad_remove_one(struct ib_device *device)
return;
for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
ib_umad_kill_port(&umad_dev->port[i]);
- kref_put(&umad_dev->ref, ib_umad_release_dev);
+ put_umad_dev(&umad_dev->ref);
+}
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
}
static int __init ib_umad_init(void)
{
int ret;
+ INIT_LIST_HEAD(&ports_list);
+
ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
"infiniband_mad");
if (ret) {
@@ -1189,6 +1274,8 @@ static int __init ib_umad_init(void)
goto out_chrdev;
}
+ umad_class->devnode = umad_devnode;
+
ret = class_create_file(umad_class, &class_attr_abi_version);
if (ret) {
printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
@@ -1218,6 +1305,8 @@ static void __exit ib_umad_cleanup(void)
ib_unregister_client(&umad_client);
class_destroy(umad_class);
unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
}
module_init(ib_umad_init);
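
With the fixed umad_port[] minor-indexed table gone, ports are found through the global ports_list keyed by cdev, and devices past the statically registered range get minors from a dynamically allocated overflow region. The resulting minor-number layout, as an illustration only (not part of the patch; the IB_UMAD_MAX_PORTS value is assumed):

	#include <linux/types.h>

	#define IB_UMAD_MAX_PORTS 64	/* value assumed for illustration */

	static dev_t umad_cdev_devt(int dev_num, dev_t base_dev, dev_t overflow_maj)
	{
		/* the first IB_UMAD_MAX_PORTS ports use the statically
		 * registered region starting at base_dev ... */
		if (dev_num < IB_UMAD_MAX_PORTS)
			return base_dev + dev_num;
		/* ... later ports fall back to the dynamically allocated
		 * overflow_maj region */
		return overflow_maj + (dev_num - IB_UMAD_MAX_PORTS);
	}
	/* The matching issm device is always IB_UMAD_MAX_PORTS minors above
	 * the umad device (the "base += IB_UMAD_MAX_PORTS" step in
	 * ib_umad_init_port()). */
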
diff --git a/sys/ofed/drivers/infiniband/core/uverbs.h b/sys/ofed/drivers/infiniband/core/uverbs.h
index fa64da5..8ca6498 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs.h
+++ b/sys/ofed/drivers/infiniband/core/uverbs.h
@@ -41,10 +41,14 @@
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/completion.h>
+#include <linux/cdev.h>
+#include <linux/rbtree.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_verbs_exp.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_verbs_exp.h>
/*
* Our lifetime rules for these structs are the following:
@@ -69,24 +73,26 @@
struct ib_uverbs_device {
struct kref ref;
+ int num_comp_vectors;
struct completion comp;
- int devnum;
- struct cdev *cdev;
struct device *dev;
struct ib_device *ib_dev;
- int num_comp_vectors;
+ int devnum;
+ struct cdev cdev;
+ struct rb_root xrcd_tree;
+ struct mutex xrcd_tree_mutex;
};
struct ib_uverbs_event_file {
struct kref ref;
struct file *filp;
+ int is_async;
struct ib_uverbs_file *uverbs_file;
spinlock_t lock;
+ int is_closed;
wait_queue_head_t poll_wait;
struct fasync_struct *async_queue;
struct list_head event_list;
- int is_async;
- int is_closed;
};
struct ib_uverbs_file {
@@ -120,9 +126,20 @@ struct ib_uevent_object {
u32 events_reported;
};
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ atomic_t refcnt;
+};
+
+struct ib_usrq_object {
+ struct ib_uevent_object uevent;
+ struct ib_uxrcd_object *uxrcd;
+};
+
struct ib_uqp_object {
struct ib_uevent_object uevent;
struct list_head mcast_list;
+ struct ib_uxrcd_object *uxrcd;
};
struct ib_ucq_object {
@@ -134,9 +151,8 @@ struct ib_ucq_object {
u32 async_events_reported;
};
-struct ib_uxrcd_object {
+struct ib_udct_object {
struct ib_uobject uobject;
- struct list_head xrc_reg_qp_list;
};
extern spinlock_t ib_uverbs_idr_lock;
@@ -147,12 +163,14 @@ extern struct idr ib_uverbs_ah_idr;
extern struct idr ib_uverbs_cq_idr;
extern struct idr ib_uverbs_qp_idr;
extern struct idr ib_uverbs_srq_idr;
-extern struct idr ib_uverbs_xrc_domain_idr;
+extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
+extern struct idr ib_uverbs_dct_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
- int is_async, int *fd);
+ int is_async);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -167,12 +185,24 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
-void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
- void *context_ptr);
-void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
- struct ib_xrcd *xrcd);
-int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
- struct ib_xrcd *xrcd, u32 qp_num);
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+
+struct ib_uverbs_flow_spec {
+ union {
+ union {
+ struct ib_uverbs_flow_spec_hdr hdr;
+ struct {
+ __u32 type;
+ __u16 size;
+ __u16 reserved;
+ };
+ };
+ struct ib_uverbs_flow_spec_eth eth;
+ struct ib_uverbs_flow_spec_ib ib;
+ struct ib_uverbs_flow_spec_ipv4 ipv4;
+ struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ };
+};
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
@@ -186,6 +216,8 @@ IB_UVERBS_DECLARE_CMD(alloc_pd);
IB_UVERBS_DECLARE_CMD(dealloc_pd);
IB_UVERBS_DECLARE_CMD(reg_mr);
IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
IB_UVERBS_DECLARE_CMD(create_comp_channel);
IB_UVERBS_DECLARE_CMD(create_cq);
IB_UVERBS_DECLARE_CMD(resize_cq);
@@ -193,6 +225,7 @@ IB_UVERBS_DECLARE_CMD(poll_cq);
IB_UVERBS_DECLARE_CMD(req_notify_cq);
IB_UVERBS_DECLARE_CMD(destroy_cq);
IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
IB_UVERBS_DECLARE_CMD(query_qp);
IB_UVERBS_DECLARE_CMD(modify_qp);
IB_UVERBS_DECLARE_CMD(destroy_qp);
@@ -207,14 +240,30 @@ IB_UVERBS_DECLARE_CMD(create_srq);
IB_UVERBS_DECLARE_CMD(modify_srq);
IB_UVERBS_DECLARE_CMD(query_srq);
IB_UVERBS_DECLARE_CMD(destroy_srq);
-IB_UVERBS_DECLARE_CMD(create_xrc_srq);
-IB_UVERBS_DECLARE_CMD(open_xrc_domain);
-IB_UVERBS_DECLARE_CMD(close_xrc_domain);
-IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp);
-IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp);
-IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp);
-IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp);
-IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+#define IB_UVERBS_DECLARE_EX_CMD(name) \
+ int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\
+ struct ib_udata *ucore, \
+ struct ib_udata *uhw)
+
+#define IB_UVERBS_DECLARE_EXP_CMD(name) \
+ ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \
+ struct ib_udata *ucore, \
+ struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EXP_CMD(create_qp);
+IB_UVERBS_DECLARE_EXP_CMD(modify_cq);
+IB_UVERBS_DECLARE_EXP_CMD(modify_qp);
+IB_UVERBS_DECLARE_EXP_CMD(create_cq);
+IB_UVERBS_DECLARE_EXP_CMD(query_device);
+IB_UVERBS_DECLARE_EXP_CMD(create_dct);
+IB_UVERBS_DECLARE_EXP_CMD(destroy_dct);
+IB_UVERBS_DECLARE_EXP_CMD(query_dct);
#endif /* UVERBS_H */
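
The declaration macros above only emit prototypes: the basic commands keep the read()-style buffer/length handlers, while the new extended (ex) and experimental (exp) commands take two ib_udata blocks. For illustration only (not part of the header), IB_UVERBS_DECLARE_EX_CMD(create_flow) expands to:

	int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
				     struct ib_udata *ucore,
				     struct ib_udata *uhw);

and IB_UVERBS_DECLARE_EXP_CMD(query_device) to:

	ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file,
					   struct ib_udata *ucore,
					   struct ib_udata *uhw);
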
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
index a34b344..5eef3f7 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
@@ -35,28 +35,68 @@
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/moduleparam.h>
+#include <linux/rbtree.h>
#include <linux/lockdep.h>
+#include <rdma/ib_addr.h>
#include <asm/uaccess.h>
#include <asm/fcntl.h>
+#include <sys/priv.h>
#include "uverbs.h"
-static struct lock_class_key pd_lock_key;
-static struct lock_class_key mr_lock_key;
-static struct lock_class_key cq_lock_key;
-static struct lock_class_key qp_lock_key;
-static struct lock_class_key ah_lock_key;
-static struct lock_class_key srq_lock_key;
+static int disable_raw_qp_enforcement;
+module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int,
+ 0444);
+MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable enforcement that RAW QPs "
+ "may only be opened by root (default: 0)");
+
+struct uverbs_lock_class {
+ struct lock_class_key key;
+ char name[16];
+};
+
+static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" };
+static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" };
+static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" };
+static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" };
+static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" };
+static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
+static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
+static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" };
+
+static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+{
+ return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
+}
+
+static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+{
+ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+}
+
+static struct ib_udata_ops uverbs_copy = {
+ .copy_from = uverbs_copy_from_udata,
+ .copy_to = uverbs_copy_to_udata
+};
#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
do { \
+ (udata)->ops = &uverbs_copy; \
(udata)->inbuf = (void __user *) (ibuf); \
(udata)->outbuf = (void __user *) (obuf); \
(udata)->inlen = (ilen); \
(udata)->outlen = (olen); \
} while (0)
+enum uverbs_cmd_type {
+ IB_USER_VERBS_CMD_BASIC,
+ IB_USER_VERBS_CMD_EXTENDED
+};
+
/*
* The ib_uobject locking scheme is as follows:
*
@@ -83,13 +123,13 @@ static struct lock_class_key srq_lock_key;
*/
static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
- struct ib_ucontext *context, struct lock_class_key *key)
+ struct ib_ucontext *context, struct uverbs_lock_class *c)
{
uobj->user_handle = user_handle;
uobj->context = context;
kref_init(&uobj->ref);
init_rwsem(&uobj->mutex);
- lockdep_set_class(&uobj->mutex, key);
+ lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
uobj->live = 0;
}
@@ -241,11 +281,34 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
}
+static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
+ return uobj ? uobj->object : NULL;
+}
+
static void put_qp_read(struct ib_qp *qp)
{
put_uobj_read(qp->uobject);
}
+static void put_qp_write(struct ib_qp *qp)
+{
+ put_uobj_write(qp->uobject);
+}
+
+static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0);
+}
+
+static void put_dct_read(struct ib_dct *dct)
+{
+ put_uobj_read(dct->uobject);
+}
+
static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
{
return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
@@ -256,12 +319,10 @@ static void put_srq_read(struct ib_srq *srq)
put_uobj_read(srq->uobject);
}
-static struct ib_xrcd *idr_read_xrcd(int xrcd_handle,
- struct ib_ucontext *context,
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
struct ib_uobject **uobj)
{
- *uobj = idr_read_uobj(&ib_uverbs_xrc_domain_idr, xrcd_handle,
- context, 0);
+ *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
return *uobj ? (*uobj)->object : NULL;
}
@@ -301,7 +362,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext = ibdev->alloc_ucontext(ibdev, &udata);
if (IS_ERR(ucontext)) {
- ret = PTR_ERR(file->ucontext);
+ ret = PTR_ERR(ucontext);
goto err;
}
@@ -314,20 +375,23 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->srq_list);
INIT_LIST_HEAD(&ucontext->ah_list);
INIT_LIST_HEAD(&ucontext->xrcd_list);
+ INIT_LIST_HEAD(&ucontext->rule_list);
+ INIT_LIST_HEAD(&ucontext->dct_list);
ucontext->closing = 0;
+ ucontext->peer_mem_private_data = NULL;
+ ucontext->peer_mem_name = NULL;
resp.num_comp_vectors = file->device->num_comp_vectors;
- filp = ib_uverbs_alloc_event_file(file, 1, &resp.async_fd);
- if (IS_ERR(filp)) {
- ret = PTR_ERR(filp);
+ ret = get_unused_fd();
+ if (ret < 0)
goto err_free;
- }
+ resp.async_fd = ret;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_file;
+ filp = ib_uverbs_alloc_event_file(file, 1);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_fd;
}
file->async_file = filp->private_data;
@@ -338,6 +402,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
if (ret)
goto err_file;
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_file;
+ }
kref_get(&file->async_file->ref);
kref_get(&file->ref);
file->ucontext = ucontext;
@@ -349,9 +418,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
err_file:
- put_unused_fd(resp.async_fd);
fput(filp);
+err_fd:
+ put_unused_fd(resp.async_fd);
+
err_free:
ibdev->dealloc_ucontext(ucontext);
@@ -360,6 +431,55 @@ err:
return ret;
}
+static void ib_uverbs_query_device_assign(
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr,
+ struct ib_uverbs_file *file)
+{
+ memset(resp, 0, sizeof(*resp));
+
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = file->device->ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = attr->device_cap_flags;
+ resp->max_sge = attr->max_sge;
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_fmr = attr->max_fmr;
+ resp->max_map_per_fmr = attr->max_map_per_fmr;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+}
+
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
const char __user *buf,
int in_len, int out_len)
@@ -379,51 +499,10 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (ret)
return ret;
- memset(&resp, 0, sizeof resp);
-
- resp.fw_ver = attr.fw_ver;
- resp.node_guid = file->device->ib_dev->node_guid;
- resp.sys_image_guid = attr.sys_image_guid;
- resp.max_mr_size = attr.max_mr_size;
- resp.page_size_cap = attr.page_size_cap;
- resp.vendor_id = attr.vendor_id;
- resp.vendor_part_id = attr.vendor_part_id;
- resp.hw_ver = attr.hw_ver;
- resp.max_qp = attr.max_qp;
- resp.max_qp_wr = attr.max_qp_wr;
- resp.device_cap_flags = attr.device_cap_flags;
- resp.max_sge = attr.max_sge;
- resp.max_sge_rd = attr.max_sge_rd;
- resp.max_cq = attr.max_cq;
- resp.max_cqe = attr.max_cqe;
- resp.max_mr = attr.max_mr;
- resp.max_pd = attr.max_pd;
- resp.max_qp_rd_atom = attr.max_qp_rd_atom;
- resp.max_ee_rd_atom = attr.max_ee_rd_atom;
- resp.max_res_rd_atom = attr.max_res_rd_atom;
- resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
- resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
- resp.atomic_cap = attr.atomic_cap;
- resp.max_ee = attr.max_ee;
- resp.max_rdd = attr.max_rdd;
- resp.max_mw = attr.max_mw;
- resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
- resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
- resp.max_mcast_grp = attr.max_mcast_grp;
- resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
- resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
- resp.max_ah = attr.max_ah;
- resp.max_fmr = attr.max_fmr;
- resp.max_map_per_fmr = attr.max_map_per_fmr;
- resp.max_srq = attr.max_srq;
- resp.max_srq_wr = attr.max_srq_wr;
- resp.max_srq_sge = attr.max_srq_sge;
- resp.max_pkeys = attr.max_pkeys;
- resp.local_ca_ack_delay = attr.local_ca_ack_delay;
- resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+ ib_uverbs_query_device_assign(&resp, &attr, file);
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp))
+ if (copy_to_user((void __user *)(unsigned long) cmd.response,
+ &resp, sizeof(resp)))
return -EFAULT;
return in_len;
@@ -469,7 +548,8 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width = attr.active_width;
resp.active_speed = attr.active_speed;
resp.phys_state = attr.phys_state;
- resp.link_layer = attr.link_layer;
+ resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ cmd.port_num);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -503,7 +583,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
if (!uobj)
return -ENOMEM;
- init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+ init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
down_write(&uobj->mutex);
pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
@@ -587,17 +667,316 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
return in_len;
}
-ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct ib_xrcd *xrcd;
+ struct inode *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+ struct inode *inode,
+ struct ib_xrcd *xrcd)
+{
+ struct xrcd_table_entry *entry, *scan;
+ struct rb_node **p = &dev->xrcd_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->xrcd = xrcd;
+ entry->inode = inode;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (inode < scan->inode) {
+ p = &(*p)->rb_left;
+ } else if (inode > scan->inode) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(entry);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&entry->node, parent, p);
+ rb_insert_color(&entry->node, &dev->xrcd_tree);
+ igrab(inode);
+ return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+ struct rb_node *p = dev->xrcd_tree.rb_node;
+
+ while (p) {
+ entry = rb_entry(p, struct xrcd_table_entry, node);
+
+ if (inode < entry->inode)
+ p = p->rb_left;
+ else if (inode > entry->inode)
+ p = p->rb_right;
+ else
+ return entry;
+ }
+
+ return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (!entry)
+ return NULL;
+
+ return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (entry) {
+ iput(inode);
+ rb_erase(&entry->node, &dev->xrcd_tree);
+ kfree(entry);
+ }
+}
+
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
const char __user *buf, int in_len,
int out_len)
{
+ struct ib_uverbs_open_xrcd cmd;
+ struct ib_uverbs_open_xrcd_resp resp;
+ struct ib_udata udata;
+ struct ib_uxrcd_object *obj;
+ struct ib_xrcd *xrcd = NULL;
+ struct fd f = {NULL};
+ struct inode *inode = NULL;
+ int ret = 0;
+ int new_xrcd = 0;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+
+ if (cmd.fd != -1) {
+ /* search for file descriptor */
+ f = fdget(cmd.fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto err_tree_mutex_unlock;
+ }
+
+ inode = f.file->f_dentry->d_inode;
+ xrcd = find_xrcd(file->device, inode);
+ if (!xrcd && !(cmd.oflags & O_CREAT)) {
+ /* no XRCD for this inode and O_CREAT was not requested */
+ ret = -EAGAIN;
+ goto err_tree_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_tree_mutex_unlock;
+ }
+ }
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj) {
+ ret = -ENOMEM;
+ goto err_tree_mutex_unlock;
+ }
+
+ init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);
+
+ down_write(&obj->uobject.mutex);
+
+ if (!xrcd) {
+ xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+
+ xrcd->inode = inode;
+ xrcd->device = file->device->ib_dev;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ new_xrcd = 1;
+ }
+
+ atomic_set(&obj->refcnt, 0);
+ obj->uobject.object = xrcd;
+ ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.xrcd_handle = obj->uobject.id;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(file->device, inode, xrcd);
+ if (ret)
+ goto err_insert_xrcd;
+ }
+ atomic_inc(&xrcd->usecnt);
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (f.file)
+ fdput(f);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+ up_write(&obj->uobject.mutex);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return in_len;
+
+err_copy:
+ if (inode) {
+ if (new_xrcd)
+ xrcd_table_delete(file->device, inode);
+ atomic_dec(&xrcd->usecnt);
+ }
+
+err_insert_xrcd:
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+
+err_idr:
+ ib_dealloc_xrcd(xrcd);
+
+err:
+ put_uobj_write(&obj->uobject);
+
+err_tree_mutex_unlock:
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_close_xrcd cmd;
+ struct ib_uobject *uobj;
+ struct ib_xrcd *xrcd = NULL;
+ struct inode *inode = NULL;
+ struct ib_uxrcd_object *obj;
+ int live;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+ uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
+ if (!uobj) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xrcd = uobj->object;
+ inode = xrcd->inode;
+ obj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ if (atomic_read(&obj->refcnt)) {
+ put_uobj_write(uobj);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
+ ret = ib_dealloc_xrcd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+ }
+
+ live = uobj->live;
+ if (inode && ret)
+ atomic_inc(&xrcd->usecnt);
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ goto out;
+
+ if (inode && !live)
+ xrcd_table_delete(file->device, inode);
+
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+ ret = in_len;
+
+out:
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+ struct ib_xrcd *xrcd)
+{
+ struct inode *inode;
+
+ inode = xrcd->inode;
+ if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+ return;
+
+ ib_dealloc_xrcd(xrcd);
+
+ if (inode)
+ xrcd_table_delete(dev, inode);
+}
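
Because ib_uverbs_open_xrcd() keys its rb-tree lookup by the backing inode, processes that pass file descriptors referring to the same file share one kernel XRC domain. A hedged sketch of the userspace counterpart, assuming the libibverbs ibv_open_xrcd() wrapper and its ibv_xrcd_init_attr fields (not part of this patch):

	#include <fcntl.h>
	#include <infiniband/verbs.h>

	static struct ibv_xrcd *open_shared_xrcd(struct ibv_context *ctx,
						 const char *path)
	{
		struct ibv_xrcd_init_attr attr;
		int fd;

		fd = open(path, O_RDONLY | O_CREAT, 0600);
		if (fd < 0)
			return NULL;

		attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
		attr.fd = fd;
		attr.oflags = O_CREAT;

		return ibv_open_xrcd(ctx, &attr);	/* NULL on failure */
	}
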
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
struct ib_uverbs_reg_mr cmd;
struct ib_uverbs_reg_mr_resp resp;
- struct ib_udata udata;
+ struct ib_udata udata;
struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mr *mr;
- int ret;
+ int ret;
if (out_len < sizeof resp)
return -ENOSPC;
@@ -612,32 +991,34 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
return -EINVAL;
- /*
- * Local write permission is required if remote write or
- * remote atomic permission is also requested.
- */
- if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
- !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
- return -EINVAL;
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ return ret;
uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
if (!uobj)
return -ENOMEM;
- init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
down_write(&uobj->mutex);
pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
- ret = -EINVAL;
+ ret = -EINVAL;
goto err_free;
}
+ /* First obtain a new "obj id" that is later passed to reg_user_mr
+ * for further use as the mr_id.
+ */
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ret)
+ goto err_put;
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags, &udata, 0);
+ cmd.access_flags, &udata, uobj->id);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
- goto err_put;
+ goto err_remove_uobj;
}
mr->device = pd->device;
@@ -647,9 +1028,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
atomic_set(&mr->usecnt, 0);
uobj->object = mr;
- ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
- if (ret)
- goto err_unreg;
memset(&resp, 0, sizeof resp);
resp.lkey = mr->lkey;
@@ -675,11 +1053,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
return in_len;
err_copy:
- idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
-
-err_unreg:
ib_dereg_mr(mr);
+err_remove_uobj:
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
err_put:
put_pd_read(pd);
@@ -689,13 +1067,13 @@ err_free:
}
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_dereg_mr cmd;
struct ib_mr *mr;
struct ib_uobject *uobj;
- int ret = -EINVAL;
+ int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -726,13 +1104,134 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
return in_len;
}
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_alloc_mw cmd;
+ struct ib_uverbs_alloc_mw_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mw *mw;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ mw = pd->device->alloc_mw(pd, cmd.mw_type);
+ if (IS_ERR(mw)) {
+ ret = PTR_ERR(mw);
+ goto err_put;
+ }
+
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mw;
+ ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
+ if (ret)
+ goto err_unalloc;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.rkey = mw->rkey;
+ resp.mw_handle = uobj->id;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mw_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+err_unalloc:
+ ib_dealloc_mw(mw);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dealloc_mw cmd;
+ struct ib_mw *mw;
+ struct ib_uobject *uobj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ mw = uobj->object;
+
+ ret = ib_dealloc_mw(mw);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_create_comp_channel cmd;
struct ib_uverbs_create_comp_channel_resp resp;
struct file *filp;
+ int ret;
if (out_len < sizeof resp)
return -ENOSPC;
@@ -740,9 +1239,16 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- filp = ib_uverbs_alloc_event_file(file, 0, &resp.fd);
- if (IS_ERR(filp))
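+ /*
+ * Reserve the descriptor first so it can be returned in resp.fd;
+ * it is released again if allocating the event file fails.
+ */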
+ ret = get_unused_fd();
+ if (ret < 0)
+ return ret;
+ resp.fd = ret;
+
+ filp = ib_uverbs_alloc_event_file(file, 0);
+ if (IS_ERR(filp)) {
+ put_unused_fd(resp.fd);
return PTR_ERR(filp);
+ }
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp)) {
@@ -755,40 +1261,44 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return in_len;
}
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static ssize_t create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len, void *vcmd, int ex,
+ void __user *response)
{
- struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_create_cq *cmd;
+ struct ib_uverbs_create_cq_ex *cmd_e;
struct ib_uverbs_create_cq_resp resp;
struct ib_udata udata;
struct ib_ucq_object *obj;
struct ib_uverbs_event_file *ev_file = NULL;
struct ib_cq *cq;
+ struct ib_cq_init_attr attr;
+ int cmd_sz;
int ret;
if (out_len < sizeof resp)
return -ENOSPC;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
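+ /*
+ * The same helper backs the basic and extended create_cq commands;
+ * only the command size and the optional create_flags differ.
+ */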
+ cmd = vcmd;
+ cmd_e = vcmd;
+ cmd_sz = ex ? sizeof(*cmd_e) : sizeof(*cmd);
+ INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp),
+ in_len - cmd_sz, out_len - sizeof(resp));
- if (cmd.comp_vector >= file->device->num_comp_vectors)
+ if (cmd->comp_vector >= file->device->num_comp_vectors)
return -EINVAL;
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
return -ENOMEM;
- init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key);
+ init_uobj(&obj->uobject, cmd->user_handle, file->ucontext,
+ &cq_lock_class);
down_write(&obj->uobject.mutex);
- if (cmd.comp_channel >= 0) {
- ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+ if (cmd->comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
if (!ev_file) {
ret = -EINVAL;
goto err;
@@ -801,8 +1311,12 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
- cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
- cmd.comp_vector,
+ memset(&attr, 0, sizeof(attr));
+ attr.cqe = cmd->cqe;
+ attr.comp_vector = cmd->comp_vector;
+ if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS))
+ attr.flags = cmd_e->create_flags;
+ cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
file->ucontext, &udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
@@ -825,8 +1339,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
resp.cq_handle = obj->uobject.id;
resp.cqe = cq->cqe;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
+ if (copy_to_user(response, &resp, sizeof(resp))) {
ret = -EFAULT;
goto err_copy;
}
@@ -856,6 +1369,19 @@ err:
return ret;
}
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ return create_cq(file, buf, in_len, out_len, &cmd,
+ IB_USER_VERBS_CMD_BASIC, (void __user *)cmd.response);
+}
+
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
const char __user *buf, int in_len,
int out_len)
@@ -893,68 +1419,81 @@ out:
return ret ? ret : in_len;
}
+static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+ struct ib_uverbs_wc tmp;
+
+ tmp.wr_id = wc->wr_id;
+ tmp.status = wc->status;
+ tmp.opcode = wc->opcode;
+ tmp.vendor_err = wc->vendor_err;
+ tmp.byte_len = wc->byte_len;
+ tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data;
+ tmp.qp_num = wc->qp->qp_num;
+ tmp.src_qp = wc->src_qp;
+ tmp.wc_flags = wc->wc_flags;
+ tmp.pkey_index = wc->pkey_index;
+ tmp.slid = wc->slid;
+ tmp.sl = wc->sl;
+ tmp.dlid_path_bits = wc->dlid_path_bits;
+ tmp.port_num = wc->port_num;
+ tmp.reserved = 0;
+
+ if (copy_to_user(dest, &tmp, sizeof tmp))
+ return -EFAULT;
+
+ return 0;
+}
+
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_poll_cq cmd;
- struct ib_uverbs_poll_cq_resp *resp;
+ struct ib_uverbs_poll_cq_resp resp;
+ u8 __user *header_ptr;
+ u8 __user *data_ptr;
struct ib_cq *cq;
- struct ib_wc *wc;
- int ret = 0;
- int i;
- int rsize;
+ struct ib_wc wc;
+ int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- wc = kmalloc(cmd.ne * sizeof *wc, GFP_KERNEL);
- if (!wc)
- return -ENOMEM;
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
- rsize = sizeof *resp + cmd.ne * sizeof(struct ib_uverbs_wc);
- resp = kmalloc(rsize, GFP_KERNEL);
- if (!resp) {
- ret = -ENOMEM;
- goto out_wc;
- }
+ /* The user buffer holds a struct ib_uverbs_poll_cq_resp header followed by the polled WCs */
+ header_ptr = (void __user *)(unsigned long) cmd.response;
+ data_ptr = header_ptr + sizeof resp;
- cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
- if (!cq) {
- ret = -EINVAL;
- goto out;
- }
+ memset(&resp, 0, sizeof resp);
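+ /*
+ * Poll one completion at a time and copy it straight to user space;
+ * this avoids allocating a kernel buffer sized by cmd.ne.
+ */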
+ while (resp.count < cmd.ne) {
+ ret = ib_poll_cq(cq, 1, &wc);
+ if (ret < 0)
+ goto out_put;
+ if (!ret)
+ break;
- resp->count = ib_poll_cq(cq, cmd.ne, wc);
+ ret = copy_wc_to_user(data_ptr, &wc);
+ if (ret)
+ goto out_put;
- put_cq_read(cq);
+ data_ptr += sizeof(struct ib_uverbs_wc);
+ ++resp.count;
+ }
- for (i = 0; i < resp->count; i++) {
- resp->wc[i].wr_id = wc[i].wr_id;
- resp->wc[i].status = wc[i].status;
- resp->wc[i].opcode = wc[i].opcode;
- resp->wc[i].vendor_err = wc[i].vendor_err;
- resp->wc[i].byte_len = wc[i].byte_len;
- resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data;
- resp->wc[i].qp_num = wc[i].qp->qp_num;
- resp->wc[i].src_qp = wc[i].src_qp;
- resp->wc[i].wc_flags = wc[i].wc_flags;
- resp->wc[i].pkey_index = wc[i].pkey_index;
- resp->wc[i].slid = wc[i].slid;
- resp->wc[i].sl = wc[i].sl;
- resp->wc[i].dlid_path_bits = wc[i].dlid_path_bits;
- resp->wc[i].port_num = wc[i].port_num;
- }
-
- if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, rsize))
+ if (copy_to_user(header_ptr, &resp, sizeof resp)) {
ret = -EFAULT;
+ goto out_put;
+ }
-out:
- kfree(resp);
+ ret = in_len;
-out_wc:
- kfree(wc);
- return ret ? ret : in_len;
+out_put:
+ put_cq_read(cq);
+ return ret;
}
ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
@@ -1035,124 +1574,181 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
const char __user *buf, int in_len,
int out_len)
{
- struct ib_uverbs_create_qp cmd;
- struct ib_uverbs_create_qp_resp resp;
+ void __user *response;
struct ib_udata udata;
struct ib_uqp_object *obj;
- struct ib_pd *pd;
- struct ib_cq *scq, *rcq;
- struct ib_srq *srq;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
struct ib_qp *qp;
struct ib_qp_init_attr attr;
- struct ib_xrcd *xrcd;
- struct ib_uobject *xrcd_uobj;
int ret;
-
- if (out_len < sizeof resp)
+ union {
+ struct ib_uverbs_create_qp basic;
+ } cmd_obj;
+ struct ib_uverbs_create_qp *cmd;
+ size_t cmd_size = 0;
+ union {
+ struct ib_uverbs_create_qp_resp basic;
+ } resp_obj;
+ struct ib_uverbs_create_qp_resp *resp;
+ size_t resp_size = 0;
+
+ cmd_size = sizeof(cmd_obj.basic);
+ cmd = &cmd_obj.basic;
+
+ resp_size = sizeof(resp_obj.basic);
+ resp = &resp_obj.basic;
+
+ if (out_len < resp_size)
return -ENOSPC;
- if (copy_from_user(&cmd, buf, sizeof cmd))
+ if (copy_from_user(&cmd_obj, buf, cmd_size))
return -EFAULT;
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ response = (void __user *)cmd->response;
- obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!disable_raw_qp_enforcement &&
+ cmd->qp_type == IB_QPT_RAW_PACKET &&
+ priv_check(curthread, PRIV_NET_RAW) != 0)
+ return -EPERM;
+
+ INIT_UDATA(&udata, buf + cmd_size, response + resp_size,
+ in_len - cmd_size, out_len - resp_size);
+
+ obj = kzalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
return -ENOMEM;
- init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key);
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class);
down_write(&obj->uevent.uobject.mutex);
- srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ?
- idr_read_srq(cmd.srq_handle, file->ucontext) : NULL;
- xrcd = cmd.qp_type == IB_QPT_XRC ?
- idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL;
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
- scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0);
- rcq = cmd.recv_cq_handle == cmd.send_cq_handle ?
- scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1);
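+ /*
+ * For XRC target QPs, cmd->pd_handle carries the XRCD handle and the
+ * device is taken from the XRCD; other QP types look up the PD, CQs
+ * and an optional SRQ as before.
+ */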
+ if (cmd->qp_type == IB_QPT_XRC_TGT) {
+ xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd->qp_type == IB_QPT_XRC_INI) {
+ cmd->max_recv_wr = 0;
+ cmd->max_recv_sge = 0;
+ } else {
+ if (cmd->is_srq) {
+ srq = idr_read_srq(cmd->srq_handle, file->ucontext);
+ if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
- if (!pd || !scq || !rcq || (cmd.is_srq && !srq) ||
- (cmd.qp_type == IB_QPT_XRC && !xrcd)) {
- ret = -EINVAL;
- goto err_put;
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
+
+ scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
+ rcq = rcq ?: scq;
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
+ if (!pd || !scq) {
+ ret = -EINVAL;
+ goto err_put;
}
- attr.create_flags = 0;
+ device = pd->device;
+ }
+
+ memset(&attr, 0, sizeof attr);
attr.event_handler = ib_uverbs_qp_event_handler;
attr.qp_context = file;
attr.send_cq = scq;
attr.recv_cq = rcq;
attr.srq = srq;
- attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
- attr.qp_type = cmd.qp_type;
- attr.xrcd = xrcd;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd->qp_type;
attr.create_flags = 0;
- attr.cap.max_send_wr = cmd.max_send_wr;
- attr.cap.max_recv_wr = cmd.max_recv_wr;
- attr.cap.max_send_sge = cmd.max_send_sge;
- attr.cap.max_recv_sge = cmd.max_recv_sge;
- attr.cap.max_inline_data = cmd.max_inline_data;
+ attr.cap.max_send_wr = cmd->max_send_wr;
+ attr.cap.max_recv_wr = cmd->max_recv_wr;
+ attr.cap.max_send_sge = cmd->max_send_sge;
+ attr.cap.max_recv_sge = cmd->max_recv_sge;
+ attr.cap.max_inline_data = cmd->max_inline_data;
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
INIT_LIST_HEAD(&obj->mcast_list);
- qp = pd->device->create_qp(pd, &attr, &udata);
+ if (cmd->qp_type == IB_QPT_XRC_TGT)
+ qp = ib_create_qp(pd, &attr);
+ else
+ qp = device->create_qp(pd, &attr, &udata);
+
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
goto err_put;
}
- qp->device = pd->device;
- qp->pd = pd;
- qp->send_cq = attr.send_cq;
- qp->recv_cq = attr.recv_cq;
- qp->srq = attr.srq;
- qp->uobject = &obj->uevent.uobject;
- qp->event_handler = attr.event_handler;
- qp->qp_context = attr.qp_context;
- qp->qp_type = attr.qp_type;
- qp->xrcd = attr.xrcd;
- atomic_inc(&pd->usecnt);
- atomic_inc(&attr.send_cq->usecnt);
- atomic_inc(&attr.recv_cq->usecnt);
- if (attr.srq)
- atomic_inc(&attr.srq->usecnt);
- else if (attr.xrcd)
- atomic_inc(&attr.xrcd->usecnt);
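+ /*
+ * XRC target QPs are fully initialized by ib_create_qp(); only the
+ * uobject back pointer is assigned for them below.
+ */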
+ if (cmd->qp_type != IB_QPT_XRC_TGT) {
+ qp->real_qp = qp;
+ qp->device = device;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ atomic_set(&qp->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&attr.send_cq->usecnt);
+ if (attr.recv_cq)
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ }
+ qp->uobject = &obj->uevent.uobject;
obj->uevent.uobject.object = qp;
ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
if (ret)
goto err_destroy;
- memset(&resp, 0, sizeof resp);
- resp.qpn = qp->qp_num;
- resp.qp_handle = obj->uevent.uobject.id;
- resp.max_recv_sge = attr.cap.max_recv_sge;
- resp.max_send_sge = attr.cap.max_send_sge;
- resp.max_recv_wr = attr.cap.max_recv_wr;
- resp.max_send_wr = attr.cap.max_send_wr;
- resp.max_inline_data = attr.cap.max_inline_data;
+ memset(&resp_obj, 0, sizeof(resp_obj));
+ resp->qpn = qp->qp_num;
+ resp->qp_handle = obj->uevent.uobject.id;
+ resp->max_recv_sge = attr.cap.max_recv_sge;
+ resp->max_send_sge = attr.cap.max_send_sge;
+ resp->max_recv_wr = attr.cap.max_recv_wr;
+ resp->max_send_wr = attr.cap.max_send_wr;
+ resp->max_inline_data = attr.cap.max_inline_data;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
+ if (copy_to_user(response, &resp_obj, resp_size)) {
+ ret = -EFAULT;
goto err_copy;
- }
+ }
- put_pd_read(pd);
- put_cq_read(scq);
- if (rcq != scq)
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+ }
+
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
- if (xrcd)
- put_xrcd_read(xrcd_uobj);
mutex_lock(&file->mutex);
list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
@@ -1171,6 +1767,8 @@ err_destroy:
ib_destroy_qp(qp);
err_put:
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
if (pd)
put_pd_read(pd);
if (scq)
@@ -1179,16 +1777,107 @@ err_put:
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
- if (xrcd)
- put_xrcd_read(xrcd_uobj);
put_uobj_write(&obj->uevent.uobject);
return ret;
}
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_open_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_qp *qp;
+ struct ib_qp_open_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
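+ /* As with XRC target QP creation, cmd.pd_handle carries the XRC domain handle */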
+ xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.qp_num = cmd.qpn;
+ attr.qp_type = cmd.qp_type;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ qp = ib_open_qp(xrcd, &attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ qp->uobject = &obj->uevent.uobject;
+
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_remove;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+ up_write(&obj->uevent.uobject.mutex);
+
+ return in_len;
+
+err_remove:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
+
+err_put:
+ put_xrcd_read(xrcd_uobj);
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_query_qp cmd;
struct ib_uverbs_query_qp_resp resp;
@@ -1286,30 +1975,59 @@ out:
return ret ? ret : in_len;
}
-ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+/* Strip attribute mask bits that are ignored for the given QP type */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
{
- struct ib_uverbs_modify_qp cmd;
- struct ib_udata udata;
- struct ib_qp *qp;
- struct ib_qp_attr *attr;
- int ret;
+ switch (qp_type) {
+ case IB_QPT_XRC_INI:
+ return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+ case IB_QPT_XRC_TGT:
+ return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY);
+ default:
+ return mask;
+ }
+}
- if (copy_from_user(&cmd, buf, sizeof cmd))
+static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len,
+ enum uverbs_cmd_type cmd_type)
+{
+ struct ib_uverbs_modify_qp_ex cmd;
+ struct ib_udata udata;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_attr_ex *attrx;
+ int ret;
+ void *p;
+ union ib_gid sgid;
+ union ib_gid *dgid;
+ u8 port_num;
+
+ if (cmd_type == IB_USER_VERBS_CMD_BASIC) {
+ p = &cmd;
+ p += sizeof(cmd.comp_mask);
+ if (copy_from_user(p, buf,
+ sizeof(struct ib_uverbs_modify_qp)))
return -EFAULT;
+ } else {
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+ }
INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
out_len);
- attr = kmalloc(sizeof *attr, GFP_KERNEL);
- if (!attr)
+ attrx = kzalloc(sizeof(*attrx), GFP_KERNEL);
+ if (!attrx)
return -ENOMEM;
+ attr = (struct ib_qp_attr *)attrx;
qp = idr_read_qp(cmd.qp_handle, file->ucontext);
if (!qp) {
- ret = -EINVAL;
- goto out;
+ kfree(attrx);
+ return -EINVAL;
}
attr->qp_state = cmd.qp_state;
@@ -1357,10 +2075,49 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
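+ /*
+ * For RoCE (Ethernet link layer) resolve the source/destination MACs
+ * and the VLAN from the GIDs before handing the attributes to the driver.
+ */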
+ port_num = (cmd.attr_mask & IB_QP_PORT) ? cmd.port_num : qp->port_num;
+ if ((cmd.attr_mask & IB_QP_AV) && port_num &&
+ (rdma_port_get_link_layer(qp->device, port_num) ==
+ IB_LINK_LAYER_ETHERNET)) {
+ ret = ib_query_gid(qp->device, port_num,
+ attr->ah_attr.grh.sgid_index, &sgid);
+ if (ret)
+ goto out;
+ dgid = &attr->ah_attr.grh.dgid;
+ if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) {
+ rdma_get_ll_mac((struct in6_addr *)dgid->raw,
+ attr->ah_attr.dmac);
+ rdma_get_ll_mac((struct in6_addr *)sgid.raw,
+ attr->smac);
+ attr->vlan_id = rdma_get_vlan_id(&sgid);
+ } else {
+ ret = rdma_addr_find_dmac_by_grh(&sgid, dgid,
+ attr->ah_attr.dmac,
+ &attr->vlan_id);
+ if (ret)
+ goto out;
+ ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac,
+ NULL);
+ if (ret)
+ goto out;
+ }
+ cmd.attr_mask |= IB_QP_SMAC;
+ if (attr->vlan_id < 0xFFFF)
+ cmd.attr_mask |= IB_QP_VID;
+ }
+ if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) {
+ if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY)
+ attrx->dct_key = cmd.dct_key;
+ }
- ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata);
-
- put_qp_read(qp);
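+ /*
+ * A QP obtained through ib_open_qp() shares a real QP owned elsewhere;
+ * its modification is routed through ib_modify_qp() so the real QP is
+ * updated.
+ */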
+ if (qp->real_qp == qp) {
+ ret = qp->device->modify_qp(qp, attr,
+ modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
+ if (!ret && (cmd.attr_mask & IB_QP_PORT))
+ qp->port_num = attr->port_num;
+ } else {
+ ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
+ }
if (ret)
goto out;
@@ -1368,18 +2125,27 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
ret = in_len;
out:
- kfree(attr);
+ put_qp_read(qp);
+ kfree(attrx);
return ret;
}
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ return __uverbs_modify_qp(file, buf, in_len, out_len,
+ IB_USER_VERBS_CMD_BASIC);
+}
+
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
const char __user *buf, int in_len,
int out_len)
{
struct ib_uverbs_destroy_qp cmd;
struct ib_uverbs_destroy_qp_resp resp;
- struct ib_uobject *uobj;
+ struct ib_uobject *uobj;
struct ib_qp *qp;
struct ib_uqp_object *obj;
int ret = -EINVAL;
@@ -1409,6 +2175,9 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
if (ret)
return ret;
+ if (obj->uxrcd)
+ atomic_dec(&obj->uxrcd->refcnt);
+
idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
mutex_lock(&file->mutex);
@@ -1429,14 +2198,14 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_post_send cmd;
struct ib_uverbs_post_send_resp resp;
struct ib_uverbs_send_wr *user_wr;
struct ib_send_wr *wr = NULL, *last, *next, *bad_wr;
- struct ib_qp *qp;
+ struct ib_qp *qp;
int i, sg_ind;
int is_ud;
ssize_t ret = -EINVAL;
@@ -1479,13 +2248,13 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
user_wr->num_sge * sizeof (struct ib_sge),
GFP_KERNEL);
if (!next) {
- ret = -ENOMEM;
- goto out_put;
- }
+ ret = -ENOMEM;
+ goto out_put;
+ }
if (!last)
wr = next;
- else
+ else
last->next = next;
last = next;
@@ -1500,7 +2269,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
file->ucontext);
if (!next->wr.ud.ah) {
ret = -EINVAL;
- goto out_put;
+ goto out_put;
}
next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn;
next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
@@ -1555,12 +2324,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
}
resp.bad_wr = 0;
- ret = qp->device->post_send(qp, wr, &bad_wr);
+ ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
if (ret)
for (next = wr; next; next = next->next) {
++resp.bad_wr;
if (next == bad_wr)
- break;
+ break;
}
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -1594,7 +2363,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
struct ib_recv_wr *wr = NULL, *last, *next;
int sg_ind;
int i;
- int ret;
+ int ret;
if (in_len < wqe_size * wr_count +
sge_count * sizeof (struct ib_uverbs_sge))
@@ -1617,9 +2386,9 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
}
if (user_wr->num_sge + sg_ind > sge_count) {
- ret = -EINVAL;
- goto err;
- }
+ ret = -EINVAL;
+ goto err;
+ }
next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
user_wr->num_sge * sizeof (struct ib_sge),
@@ -1627,7 +2396,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
if (!next) {
ret = -ENOMEM;
goto err;
- }
+ }
if (!last)
wr = next;
@@ -1693,7 +2462,7 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
goto out;
resp.bad_wr = 0;
- ret = qp->device->post_recv(qp, wr, &bad_wr);
+ ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
put_qp_read(qp);
@@ -1768,8 +2537,8 @@ out:
}
ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_create_ah cmd;
struct ib_uverbs_create_ah_resp resp;
@@ -1789,10 +2558,10 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
if (!uobj)
return -ENOMEM;
- init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key);
+ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
down_write(&uobj->mutex);
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
goto err;
@@ -1863,7 +2632,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
struct ib_uverbs_destroy_ah cmd;
struct ib_ah *ah;
struct ib_uobject *uobj;
- int ret;
+ int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -1906,7 +2675,7 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ qp = idr_write_qp(cmd.qp_handle, file->ucontext);
if (!qp)
return -EINVAL;
@@ -1935,25 +2704,25 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
kfree(mcast);
out_put:
- put_qp_read(qp);
+ put_qp_write(qp);
return ret ? ret : in_len;
}
ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_detach_mcast cmd;
struct ib_uqp_object *obj;
struct ib_qp *qp;
struct ib_uverbs_mcast_entry *mcast;
- int ret = -EINVAL;
+ int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ qp = idr_write_qp(cmd.qp_handle, file->ucontext);
if (!qp)
return -EINVAL;
@@ -1972,102 +2741,122 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
}
out_put:
- put_qp_read(qp);
+ put_qp_write(qp);
return ret ? ret : in_len;
}
-ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
{
- struct ib_uverbs_create_srq cmd;
struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
- struct ib_uevent_object *obj;
+ struct ib_usrq_object *obj;
struct ib_pd *pd;
struct ib_srq *srq;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
struct ib_srq_init_attr attr;
int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
+ obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
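+ /*
+ * XRC SRQs additionally reference an XRC domain and a completion
+ * queue; both are looked up and pinned before the PD.
+ */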
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err;
+ }
- obj = kmalloc(sizeof *obj, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
- init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key);
- down_write(&obj->uobject.mutex);
+ attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+ if (!attr.ext.xrc.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
- goto err;
- }
+ goto err_put_cq;
+ }
attr.event_handler = ib_uverbs_srq_event_handler;
attr.srq_context = file;
- attr.attr.max_wr = cmd.max_wr;
- attr.attr.max_sge = cmd.max_sge;
- attr.attr.srq_limit = cmd.srq_limit;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
- obj->events_reported = 0;
- INIT_LIST_HEAD(&obj->event_list);
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
- srq = pd->device->create_srq(pd, &attr, &udata);
+ srq = pd->device->create_srq(pd, &attr, udata);
if (IS_ERR(srq)) {
ret = PTR_ERR(srq);
goto err_put;
}
- srq->device = pd->device;
- srq->pd = pd;
- srq->uobject = &obj->uobject;
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->srq_type = cmd->srq_type;
+ srq->uobject = &obj->uevent.uobject;
srq->event_handler = attr.event_handler;
srq->srq_context = attr.srq_context;
- srq->ext.xrc.cq = NULL;
- srq->ext.xrc.xrcd = NULL;
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.cq = attr.ext.xrc.cq;
+ srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+ atomic_inc(&attr.ext.xrc.cq->usecnt);
+ atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+ }
+
atomic_inc(&pd->usecnt);
atomic_set(&srq->usecnt, 0);
- obj->uobject.object = srq;
- ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ obj->uevent.uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
if (ret)
goto err_destroy;
memset(&resp, 0, sizeof resp);
- resp.srq_handle = obj->uobject.id;
+ resp.srq_handle = obj->uevent.uobject.id;
resp.max_wr = attr.attr.max_wr;
resp.max_sge = attr.attr.max_sge;
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ if (copy_to_user((void __user *) (unsigned long) cmd->response,
&resp, sizeof resp)) {
ret = -EFAULT;
goto err_copy;
}
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ put_uobj_read(xrcd_uobj);
+ put_cq_read(attr.ext.xrc.cq);
+ }
put_pd_read(pd);
mutex_lock(&file->mutex);
- list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
mutex_unlock(&file->mutex);
- obj->uobject.live = 1;
+ obj->uevent.uobject.live = 1;
- up_write(&obj->uobject.mutex);
+ up_write(&obj->uevent.uobject.mutex);
- return in_len;
+ return 0;
err_copy:
- idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
err_destroy:
ib_destroy_srq(srq);
@@ -2075,25 +2864,29 @@ err_destroy:
err_put:
put_pd_read(pd);
+err_put_cq:
+ if (cmd->srq_type == IB_SRQT_XRC)
+ put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ put_uobj_read(xrcd_uobj);
+ }
+
err:
- put_uobj_write(&obj->uobject);
+ put_uobj_write(&obj->uevent.uobject);
return ret;
}
-ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
{
- struct ib_uverbs_create_xsrq cmd;
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_xsrq xcmd;
struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
- struct ib_uevent_object *obj;
- struct ib_pd *pd;
- struct ib_srq *srq;
- struct ib_cq *xrc_cq;
- struct ib_xrcd *xrcd;
- struct ib_srq_init_attr attr;
- struct ib_uobject *xrcd_uobj;
+ struct ib_udata udata;
int ret;
if (out_len < sizeof resp)
@@ -2102,113 +2895,48 @@ ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
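+ /* Translate the legacy create_srq command into the extended xsrq form */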
+ xcmd.response = cmd.response;
+ xcmd.user_handle = cmd.user_handle;
+ xcmd.srq_type = IB_SRQT_BASIC;
+ xcmd.pd_handle = cmd.pd_handle;
+ xcmd.max_wr = cmd.max_wr;
+ xcmd.max_sge = cmd.max_sge;
+ xcmd.srq_limit = cmd.srq_limit;
+
INIT_UDATA(&udata, buf + sizeof cmd,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- obj = kmalloc(sizeof *obj, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
-
- init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
- &srq_lock_key);
- down_write(&obj->uobject.mutex);
-
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
- if (!pd) {
- ret = -EINVAL;
- goto err;
- }
-
- xrc_cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
- if (!xrc_cq) {
- ret = -EINVAL;
- goto err_put_pd;
- }
-
- xrcd = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj);
- if (!xrcd) {
- ret = -EINVAL;
- goto err_put_cq;
- }
-
-
- attr.event_handler = ib_uverbs_srq_event_handler;
- attr.srq_context = file;
- attr.attr.max_wr = cmd.max_wr;
- attr.attr.max_sge = cmd.max_sge;
- attr.attr.srq_limit = cmd.srq_limit;
-
- obj->events_reported = 0;
- INIT_LIST_HEAD(&obj->event_list);
-
- srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, &attr, &udata);
- if (IS_ERR(srq)) {
- ret = PTR_ERR(srq);
- goto err_put;
- }
-
- srq->device = pd->device;
- srq->pd = pd;
- srq->uobject = &obj->uobject;
- srq->event_handler = attr.event_handler;
- srq->srq_context = attr.srq_context;
- srq->ext.xrc.cq = xrc_cq;
- srq->ext.xrc.xrcd = xrcd;
- atomic_inc(&pd->usecnt);
- atomic_inc(&xrc_cq->usecnt);
- atomic_inc(&xrcd->usecnt);
-
- atomic_set(&srq->usecnt, 0);
-
- obj->uobject.object = srq;
- ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ ret = __uverbs_create_xsrq(file, &xcmd, &udata);
if (ret)
- goto err_destroy;
-
- memset(&resp, 0, sizeof resp);
- resp.srq_handle = obj->uobject.id;
- resp.max_wr = attr.attr.max_wr;
- resp.max_sge = attr.attr.max_sge;
-
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_copy;
- }
-
- put_xrcd_read(xrcd_uobj);
- put_cq_read(xrc_cq);
- put_pd_read(pd);
-
- mutex_lock(&file->mutex);
- list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
- mutex_unlock(&file->mutex);
-
- obj->uobject.live = 1;
-
- up_write(&obj->uobject.mutex);
+ return ret;
return in_len;
+}
-err_copy:
- idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_create_xsrq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
-err_destroy:
- ib_destroy_srq(srq);
+ if (out_len < sizeof resp)
+ return -ENOSPC;
-err_put:
- put_xrcd_read(xrcd_uobj);
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
-err_put_cq:
- put_cq_read(xrc_cq);
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
-err_put_pd:
- put_pd_read(pd);
+ ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ if (ret)
+ return ret;
-err:
- put_uobj_write(&obj->uobject);
- return ret;
+ return in_len;
}
ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
@@ -2266,7 +2994,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
put_srq_read(srq);
if (ret)
- return ret;
+ return ret;
memset(&resp, 0, sizeof resp);
@@ -2282,8 +3010,8 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_destroy_srq cmd;
struct ib_uverbs_destroy_srq_resp resp;
@@ -2291,6 +3019,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
struct ib_srq *srq;
struct ib_uevent_object *obj;
int ret = -EINVAL;
+ struct ib_usrq_object *us;
+ enum ib_srq_type srq_type;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -2300,6 +3030,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
return -EINVAL;
srq = uobj->object;
obj = container_of(uobj, struct ib_uevent_object, uobject);
+ srq_type = srq->srq_type;
ret = ib_destroy_srq(srq);
if (!ret)
@@ -2310,6 +3041,11 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
if (ret)
return ret;
+ if (srq_type == IB_SRQT_XRC) {
+ us = container_of(obj, struct ib_usrq_object, uevent);
+ atomic_dec(&us->uxrcd->refcnt);
+ }
+
idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
mutex_lock(&file->mutex);
@@ -2330,313 +3066,467 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
return ret ? ret : in_len;
}
-static struct inode *xrc_file2inode(struct file *f)
+ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- return f->f_dentry->d_inode;
-}
+ int in_len = ucore->inlen + uhw->inlen;
+ int out_len = ucore->outlen + uhw->outlen;
+ struct ib_uverbs_create_dct cmd;
+ struct ib_uverbs_create_dct_resp resp;
+ struct ib_udata udata;
+ struct ib_udct_object *obj;
+ struct ib_dct *dct;
+ int ret;
+ struct ib_dct_init_attr attr;
+ struct ib_pd *pd = NULL;
+ struct ib_cq *cq = NULL;
+ struct ib_srq *srq = NULL;
-struct xrcd_table_entry {
- struct rb_node node;
- struct inode *inode;
- struct ib_xrcd *xrcd;
-};
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
-static int xrcd_table_insert(struct ib_device *dev,
- struct inode *i_n,
- struct ib_xrcd *xrcd)
-{
- struct xrcd_table_entry *entry, *scan;
- struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
- struct rb_node *parent = NULL;
+ ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
+ if (ret)
+ return ret;
- entry = kmalloc(sizeof(struct xrcd_table_entry), GFP_KERNEL);
- if (!entry)
+ obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
return -ENOMEM;
- entry->inode = i_n;
- entry->xrcd = xrcd;
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
+ &dct_lock_class);
+ down_write(&obj->uobject.mutex);
- while (*p) {
- parent = *p;
- scan = rb_entry(parent, struct xrcd_table_entry, node);
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_pd;
+ }
- if (i_n < scan->inode)
- p = &(*p)->rb_left;
- else if (i_n > scan->inode)
- p = &(*p)->rb_right;
- else {
- kfree(entry);
- return -EEXIST;
- }
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq) {
+ ret = -EINVAL;
+ goto err_put;
}
- rb_link_node(&entry->node, parent, p);
- rb_insert_color(&entry->node, &dev->ib_uverbs_xrcd_table);
- igrab(i_n);
- return 0;
-}
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
-static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev,
- struct inode *i_n)
-{
- struct xrcd_table_entry *scan;
- struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
- struct rb_node *parent = NULL;
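+ /*
+ * Fill the DC target attributes from the command; the DCT shares the
+ * PD, CQ and SRQ looked up above.
+ */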
+ attr.cq = cq;
+ attr.access_flags = cmd.access_flags;
+ attr.min_rnr_timer = cmd.min_rnr_timer;
+ attr.srq = srq;
+ attr.tclass = cmd.tclass;
+ attr.flow_label = cmd.flow_label;
+ attr.dc_key = cmd.dc_key;
+ attr.mtu = cmd.mtu;
+ attr.port = cmd.port;
+ attr.pkey_index = cmd.pkey_index;
+ attr.gid_index = cmd.gid_index;
+ attr.hop_limit = cmd.hop_limit;
+ attr.create_flags = cmd.create_flags;
+
+ dct = ib_create_dct(pd, &attr, &udata);
+ if (IS_ERR(dct)) {
+ ret = PTR_ERR(dct);
+ goto err_put;
+ }
- while (*p) {
- parent = *p;
- scan = rb_entry(parent, struct xrcd_table_entry, node);
+ dct->device = file->device->ib_dev;
+ dct->uobject = &obj->uobject;
- if (i_n < scan->inode)
- p = &(*p)->rb_left;
- else if (i_n > scan->inode)
- p = &(*p)->rb_right;
- else
- return scan;
- }
- return NULL;
-}
+ obj->uobject.object = dct;
+ ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject);
+ if (ret)
+ goto err_dct;
-static int find_xrcd(struct ib_device *dev, struct inode *i_n,
- struct ib_xrcd **xrcd)
-{
- struct xrcd_table_entry *entry;
+ memset(&resp, 0, sizeof(resp));
+ resp.dct_handle = obj->uobject.id;
+ resp.dctn = dct->dct_num;
- entry = xrcd_table_search(dev, i_n);
- if (!entry)
- return -EINVAL;
+ ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
+ if (ret)
+ goto err_copy;
- *xrcd = entry->xrcd;
- return 0;
-}
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->dct_list);
+ mutex_unlock(&file->mutex);
+ obj->uobject.live = 1;
-static void xrcd_table_delete(struct ib_device *dev,
- struct inode *i_n)
-{
- struct xrcd_table_entry *entry = xrcd_table_search(dev, i_n);
+ put_srq_read(srq);
+ put_cq_read(cq);
+ put_pd_read(pd);
- if (entry) {
- iput(i_n);
- rb_erase(&entry->node, &dev->ib_uverbs_xrcd_table);
- kfree(entry);
- }
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject);
+
+err_dct:
+ ib_destroy_dct(dct);
+
+err_put:
+ if (srq)
+ put_srq_read(srq);
+
+ if (cq)
+ put_cq_read(cq);
+
+ put_pd_read(pd);
+
+err_pd:
+ put_uobj_write(&obj->uobject);
+ return ret;
}
-ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- struct ib_uverbs_open_xrc_domain cmd;
- struct ib_uverbs_open_xrc_domain_resp resp;
- struct ib_udata udata;
- struct ib_uobject *uobj;
- struct ib_uxrcd_object *xrcd_uobj;
- struct ib_xrcd *xrcd = NULL;
- struct file *f = NULL;
- struct inode *inode = NULL;
- int ret = 0;
- int new_xrcd = 0;
+ int in_len = ucore->inlen + uhw->inlen;
+ int out_len = ucore->outlen + uhw->outlen;
+ struct ib_uverbs_destroy_dct cmd;
+ struct ib_uverbs_destroy_dct_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_dct *dct;
+ struct ib_udct_object *obj;
+ int ret;
- if (out_len < sizeof resp)
+ if (out_len < sizeof(resp))
return -ENOSPC;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
+ if (ret)
+ return ret;
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
- if (cmd.fd != (u32) (-1)) {
- /* search for file descriptor */
- f = fget(cmd.fd);
- if (!f) {
- ret = -EBADF;
- goto err_table_mutex_unlock;
- }
+ dct = uobj->object;
+ obj = container_of(dct->uobject, struct ib_udct_object, uobject);
- inode = xrc_file2inode(f);
- if (!inode) {
- ret = -EBADF;
- goto err_table_mutex_unlock;
- }
+ ret = ib_destroy_dct(dct);
+ if (!ret)
+ uobj->live = 0;
- ret = find_xrcd(file->device->ib_dev, inode, &xrcd);
- if (ret && !(cmd.oflags & O_CREAT)) {
- /* no file descriptor. Need CREATE flag */
- ret = -EAGAIN;
- goto err_table_mutex_unlock;
- }
+ put_uobj_write(uobj);
- if (xrcd && cmd.oflags & O_EXCL) {
- ret = -EINVAL;
- goto err_table_mutex_unlock;
- }
- }
+ if (ret)
+ return ret;
- xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL);
- if (!xrcd_uobj) {
- ret = -ENOMEM;
- goto err_table_mutex_unlock;
- }
+ idr_remove_uobj(&ib_uverbs_dct_idr, uobj);
- uobj = &xrcd_uobj->uobject;
- init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
- down_write(&uobj->mutex);
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
- if (!xrcd) {
- xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
- file->ucontext, &udata);
- if (IS_ERR(xrcd)) {
- ret = PTR_ERR(xrcd);
- goto err;
- }
- xrcd->uobject = (cmd.fd == -1) ? uobj : NULL;
- xrcd->inode = inode;
- xrcd->device = file->device->ib_dev;
- atomic_set(&xrcd->usecnt, 0);
- new_xrcd = 1;
- }
+ memset(&resp, 0, sizeof(resp));
- uobj->object = xrcd;
- ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+ put_uobj(uobj);
+
+ ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
if (ret)
- goto err_idr;
+ return ret;
- memset(&resp, 0, sizeof resp);
- resp.xrcd_handle = uobj->id;
+ return in_len;
+}
- if (inode) {
- if (new_xrcd) {
- /* create new inode/xrcd table entry */
- ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd);
- if (ret)
- goto err_insert_xrcd;
- }
- atomic_inc(&xrcd->usecnt);
+ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ int in_len = ucore->inlen + uhw->inlen;
+ int out_len = ucore->outlen + uhw->outlen;
+ struct ib_uverbs_query_dct cmd;
+ struct ib_uverbs_query_dct_resp resp;
+ struct ib_dct *dct;
+ struct ib_dct_attr *attr;
+ int err;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ err = -ENOMEM;
+ goto out;
}
- if (f)
- fput(f);
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_copy;
+ dct = idr_read_dct(cmd.dct_handle, file->ucontext);
+ if (!dct) {
+ err = -EINVAL;
+ goto out;
}
- INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list);
+ err = ib_query_dct(dct, attr);
- mutex_lock(&file->mutex);
- list_add_tail(&uobj->list, &file->ucontext->xrcd_list);
- mutex_unlock(&file->mutex);
+ put_dct_read(dct);
- uobj->live = 1;
+ if (err)
+ goto out;
- up_write(&uobj->mutex);
+ memset(&resp, 0, sizeof(resp));
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- return in_len;
+ resp.dc_key = attr->dc_key;
+ resp.access_flags = attr->access_flags;
+ resp.flow_label = attr->flow_label;
+ resp.key_violations = attr->key_violations;
+ resp.port = attr->port;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.tclass = attr->tclass;
+ resp.mtu = attr->mtu;
+ resp.pkey_index = attr->pkey_index;
+ resp.gid_index = attr->gid_index;
+ resp.hop_limit = attr->hop_limit;
+ resp.state = attr->state;
-err_copy:
+ err = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
- if (inode) {
- if (new_xrcd)
- xrcd_table_delete(file->device->ib_dev, inode);
- atomic_dec(&xrcd->usecnt);
+out:
+ kfree(attr);
+
+ return err ? err : in_len;
+}
+
+/*
+ * Experimental functions
+ */
+
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec)
+{
+ ib_spec->type = kern_spec->type;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+ memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+ sizeof(struct ib_flow_eth_filter));
+ memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+ sizeof(struct ib_flow_eth_filter));
+ break;
+ case IB_FLOW_SPEC_IB:
+ ib_spec->ib.size = sizeof(struct ib_flow_spec_ib);
+ memcpy(&ib_spec->ib.val, &kern_spec->ib.val,
+ sizeof(struct ib_flow_ib_filter));
+ memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask,
+ sizeof(struct ib_flow_ib_filter));
+ break;
+ case IB_FLOW_SPEC_IPV4:
+ ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+ memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+ sizeof(struct ib_flow_ipv4_filter));
+ memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+ sizeof(struct ib_flow_ipv4_filter));
+ break;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+ memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+ sizeof(struct ib_flow_tcp_udp_filter));
+ memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+ sizeof(struct ib_flow_tcp_udp_filter));
+ break;
+ default:
+ return -EINVAL;
}
+ return 0;
+}
-err_insert_xrcd:
- idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_create_flow cmd;
+ struct ib_uverbs_create_flow_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_flow *flow_id;
+ struct ib_uverbs_flow_attr *kern_flow_attr;
+ struct ib_flow_attr *flow_attr;
+ struct ib_qp *qp;
+ int err = 0;
+ void *kern_spec;
+ void *ib_spec;
+ int i;
+
+ if (ucore->outlen < sizeof(resp))
+ return -ENOSPC;
-err_idr:
- ib_dealloc_xrcd(xrcd);
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
-err:
- put_uobj_write(uobj);
+ ucore->inbuf += sizeof(cmd);
+ ucore->inlen -= sizeof(cmd);
-err_table_mutex_unlock:
+ if (cmd.comp_mask)
+ return -EINVAL;
- if (f)
- fput(f);
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- return ret;
-}
+ if (priv_check(curthread, PRIV_NET_RAW) != 0 && !disable_raw_qp_enforcement)
+ return -EPERM;
-ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- struct ib_uverbs_close_xrc_domain cmd;
- struct ib_uobject *uobj, *t_uobj;
- struct ib_uxrcd_object *xrcd_uobj;
- struct ib_xrcd *xrcd = NULL;
- struct inode *inode = NULL;
- int ret = 0;
+ if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+ return -EINVAL;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ if (cmd.flow_attr.size > ucore->inlen ||
+ cmd.flow_attr.size >
+ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+ return -EINVAL;
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
- uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle,
- file->ucontext);
+ if (cmd.flow_attr.num_of_specs) {
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) +
+ cmd.flow_attr.size, GFP_KERNEL);
+ if (!kern_flow_attr)
+ return -ENOMEM;
+
+ memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+ err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+ cmd.flow_attr.size);
+ if (err)
+ goto err_free_attr;
+ } else {
+ kern_flow_attr = &cmd.flow_attr;
+ }
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
if (!uobj) {
- ret = -EINVAL;
- goto err_unlock_mutex;
+ err = -ENOMEM;
+ goto err_free_attr;
}
+ init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+ down_write(&uobj->mutex);
- mutex_lock(&file->mutex);
- if (!ret) {
- list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) {
- struct ib_qp *qp = t_uobj->object;
- if (qp->xrcd && qp->xrcd == uobj->object) {
- ret = -EBUSY;
- break;
- }
- }
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ err = -EINVAL;
+ goto err_uobj;
}
- if (!ret) {
- list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) {
- struct ib_srq *srq = t_uobj->object;
- if (srq->ext.xrc.xrcd && srq->ext.xrc.xrcd == uobj->object) {
- ret = -EBUSY;
- break;
- }
+
+ flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
+ if (!flow_attr) {
+ err = -ENOMEM;
+ goto err_put;
+ }
+
+ flow_attr->type = kern_flow_attr->type;
+ flow_attr->priority = kern_flow_attr->priority;
+ flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+ flow_attr->port = kern_flow_attr->port;
+ flow_attr->flags = kern_flow_attr->flags;
+ flow_attr->size = sizeof(*flow_attr);
+
+ kern_spec = kern_flow_attr + 1;
+ ib_spec = flow_attr + 1;
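+ /*
+ * Walk the variable-sized array of flow specs that follows the
+ * attribute header, converting each one and accounting for its size.
+ */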
+ for (i = 0; i < flow_attr->num_of_specs &&
+ cmd.flow_attr.size >
+ offsetof(struct ib_uverbs_flow_spec, reserved) &&
+ cmd.flow_attr.size >=
+ ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
+ err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+ if (err)
+ goto err_free;
+ flow_attr->size +=
+ ((union ib_flow_spec *)ib_spec)->size;
+ cmd.flow_attr.size -=
+ ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+ kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+ ib_spec += ((union ib_flow_spec *)ib_spec)->size;
+ }
+ if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+ pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+ i, cmd.flow_attr.size);
+ err = -EINVAL;
+ goto err_free;
+ }
+ flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+ if (IS_ERR(flow_id)) {
+ err = PTR_ERR(flow_id);
+ goto err_free;
}
+ flow_id->qp = qp;
+ flow_id->uobject = uobj;
+ uobj->object = flow_id;
+
+ err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+ if (err)
+ goto destroy_flow;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.flow_handle = uobj->id;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, sizeof(resp));
+ if (err)
+ goto err_copy;
+
+ put_qp_read(qp);
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->rule_list);
mutex_unlock(&file->mutex);
- if (ret) {
- put_uobj_write(uobj);
- goto err_unlock_mutex;
- }
- xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
- if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) {
- ret = -EBUSY;
- put_uobj_write(uobj);
- goto err_unlock_mutex;
- }
+ uobj->live = 1;
- xrcd = (struct ib_xrcd *) (uobj->object);
- inode = xrcd->inode;
+ up_write(&uobj->mutex);
+ kfree(flow_attr);
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return 0;
+err_copy:
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+ ib_destroy_flow(flow_id);
+err_free:
+ kfree(flow_attr);
+err_put:
+ put_qp_read(qp);
+err_uobj:
+ put_uobj_write(uobj);
+err_free_attr:
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return err;
+}
- if (inode)
- atomic_dec(&xrcd->usecnt);
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_destroy_flow cmd;
+ struct ib_flow *flow_id;
+ struct ib_uobject *uobj;
+ int ret;
- ret = ib_dealloc_xrcd(uobj->object);
+ ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+ file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ flow_id = uobj->object;
+
+ ret = ib_destroy_flow(flow_id);
if (!ret)
uobj->live = 0;
put_uobj_write(uobj);
- if (ret && !inode)
- goto err_unlock_mutex;
-
- if (!ret && inode)
- xrcd_table_delete(file->device->ib_dev, inode);
-
- idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
mutex_lock(&file->mutex);
list_del(&uobj->list);
@@ -2644,380 +3534,378 @@ ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file,
put_uobj(uobj);
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- return in_len;
-
-err_unlock_mutex:
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
return ret;
}
-void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
- struct ib_xrcd *xrcd)
+ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore, struct ib_udata *uhw)
{
- struct inode *inode = NULL;
- int ret = 0;
+ const char __user *buf = ucore->inbuf;
+ int in_len = ucore->inlen + uhw->inlen;
+ int out_len = ucore->outlen + uhw->outlen;
- inode = xrcd->inode;
- if (inode)
- atomic_dec(&xrcd->usecnt);
+ return __uverbs_modify_qp(file, buf, in_len, out_len,
+ IB_USER_VERBS_CMD_EXTENDED);
+}
- ret = ib_dealloc_xrcd(xrcd);
- if (!ret && inode)
- xrcd_table_delete(ib_dev, inode);
+
+ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore, struct ib_udata *uhw)
+{
+ const char __user *buf = ucore->inbuf;
+ int in_len = ucore->inlen + uhw->inlen;
+ int out_len = ucore->outlen + uhw->outlen;
+ struct ib_uverbs_create_cq_ex cmd;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ return create_cq(file, buf, in_len, out_len, &cmd,
+ IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf);
}
-ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file,
+ struct ib_udata *ucore, struct ib_udata *uhw)
{
- struct ib_uverbs_create_xrc_rcv_qp cmd;
- struct ib_uverbs_create_xrc_rcv_qp_resp resp;
- struct ib_uxrc_rcv_object *obj;
- struct ib_qp_init_attr init_attr;
- struct ib_xrcd *xrcd;
- struct ib_uobject *uobj;
- struct ib_uxrcd_object *xrcd_uobj;
- u32 qp_num;
- int err;
+ const char __user *buf = ucore->inbuf;
+ int in_len = ucore->inlen + uhw->inlen;
+ struct ib_uverbs_modify_cq_ex cmd;
+ struct ib_cq *cq;
+ struct ib_cq_attr attr;
+ int ret;
- if (out_len < sizeof resp)
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ attr.moderation.cq_count = cmd.cq_count;
+ attr.moderation.cq_period = cmd.cq_period;
+ attr.cq_cap_flags = cmd.cq_cap_flags;
+
+ ret = ib_modify_cq(cq, &attr, cmd.attr_mask);
+
+ put_cq_read(cq);
+
+ return ret ? ret : in_len;
+}
+
+
+ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file,
+ struct ib_udata *ucore, struct ib_udata *uhw)
+{
+ struct ib_uverbs_exp_query_device_resp resp;
+ struct ib_exp_device_attr exp_attr;
+ int ret;
+
+ if (ucore->outlen + uhw->outlen < sizeof(resp))
return -ENOSPC;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ memset(&resp, 0, sizeof(resp));
+ memset(&exp_attr, 0, sizeof(exp_attr));
+ ret = ib_exp_query_device(file->device->ib_dev, &exp_attr);
+ if (ret)
+ return ret;
- obj = kzalloc(sizeof *obj, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
+ ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file);
- xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
- if (!xrcd) {
- err = -EINVAL;
- goto err_out;
+ resp.comp_mask = 0;
+ resp.device_cap_flags2 = 0;
+
+ /*
+ * Handle regular attr fields
+ */
+ if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) {
+ resp.timestamp_mask = exp_attr.base.timestamp_mask;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK;
}
- init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler;
- init_attr.qp_context = file;
- init_attr.srq = NULL;
- init_attr.sq_sig_type =
- cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
- init_attr.qp_type = IB_QPT_XRC;
- init_attr.xrcd = xrcd;
+ if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) {
+ resp.hca_core_clock = exp_attr.base.hca_core_clock;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK;
+ }
- init_attr.cap.max_send_wr = 1;
- init_attr.cap.max_recv_wr = 0;
- init_attr.cap.max_send_sge = 1;
- init_attr.cap.max_recv_sge = 0;
- init_attr.cap.max_inline_data = 0;
+ /*
+ * Handle experimental attr fields
+ */
+ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) {
+ resp.device_cap_flags2 = exp_attr.device_cap_flags2;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2;
+ }
- err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num);
- if (err)
- goto err_put;
+ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) {
+ resp.dc_rd_req = exp_attr.dc_rd_req;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD;
+ }
- memset(&resp, 0, sizeof resp);
- resp.qpn = qp_num;
+ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) {
+ resp.dc_rd_res = exp_attr.dc_rd_res;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD;
+ }
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- err = -EFAULT;
- goto err_destroy;
+ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) {
+ resp.inline_recv_sz = exp_attr.inline_recv_sz;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
}
- atomic_inc(&xrcd->usecnt);
- put_xrcd_read(uobj);
- obj->qp_num = qp_num;
- obj->domain_handle = cmd.xrc_domain_handle;
- xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
- list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list);
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) {
+ resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz;
+ resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
+ }
- return in_len;
+ if (copy_to_user(ucore->outbuf, &resp, sizeof(resp)))
+ return -EFAULT;
-err_destroy:
- xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
-err_put:
- put_xrcd_read(uobj);
-err_out:
- kfree(obj);
- return err;
+ return ucore->inlen + uhw->inlen;
}
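
The response above is versioned through comp_mask: an optional field is meaningful only when its bit is set, which lets older user libraries ignore fields they do not know about. A standalone sketch of the idiom follows, with made-up bits and fields that are not the real ABI.

	#include <stdint.h>
	#include <stdio.h>

	/* Made-up bits and response layout, only to show the comp_mask idiom. */
	#define DEMO_ATTR_TIMESTAMP	(1u << 0)
	#define DEMO_ATTR_CORE_CLOCK	(1u << 1)

	struct demo_resp {
		uint32_t comp_mask;	/* which optional fields below are valid */
		uint64_t timestamp_mask;
		uint64_t hca_core_clock;
	};

	static void consume(const struct demo_resp *r)
	{
		if (r->comp_mask & DEMO_ATTR_TIMESTAMP)
			printf("timestamp_mask = 0x%llx\n",
			       (unsigned long long)r->timestamp_mask);
		if (r->comp_mask & DEMO_ATTR_CORE_CLOCK)
			printf("hca_core_clock = %llu\n",
			       (unsigned long long)r->hca_core_clock);
		/* Clear bits mean "the kernel did not report this field". */
	}

	int main(void)
	{
		struct demo_resp r = { DEMO_ATTR_TIMESTAMP, 0xffffffffffffULL, 0 };

		consume(&r);
		return 0;
	}
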
-ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore, struct ib_udata *uhw)
{
- struct ib_uverbs_modify_xrc_rcv_qp cmd;
- struct ib_qp_attr *attr;
- struct ib_xrcd *xrcd;
- struct ib_uobject *uobj;
- int err;
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ struct ib_exp_qp_init_attr attr;
+ int ret;
+ struct ib_uverbs_exp_create_qp cmd_exp;
+ struct ib_uverbs_exp_create_qp_resp resp_exp;
+ struct ib_qp *parentqp = NULL;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ memset(&cmd_exp, 0, sizeof(cmd_exp));
- attr = kzalloc(sizeof *attr, GFP_KERNEL);
- if (!attr)
- return -ENOMEM;
+ ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp));
+ if (ret)
+ return ret;
- xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
- if (!xrcd) {
- kfree(attr);
- return -EINVAL;
- }
+ if (!disable_raw_qp_enforcement &&
+ cmd_exp.qp_type == IB_QPT_RAW_PACKET && !priv_check(curthread,
+ PRIV_NET_RAW))
+ return -EPERM;
- attr->qp_state = cmd.qp_state;
- attr->cur_qp_state = cmd.cur_qp_state;
- attr->qp_access_flags = cmd.qp_access_flags;
- attr->pkey_index = cmd.pkey_index;
- attr->port_num = cmd.port_num;
- attr->path_mtu = cmd.path_mtu;
- attr->path_mig_state = cmd.path_mig_state;
- attr->qkey = cmd.qkey;
- attr->rq_psn = cmd.rq_psn;
- attr->sq_psn = cmd.sq_psn;
- attr->dest_qp_num = cmd.dest_qp_num;
- attr->alt_pkey_index = cmd.alt_pkey_index;
- attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
- attr->max_rd_atomic = cmd.max_rd_atomic;
- attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
- attr->min_rnr_timer = cmd.min_rnr_timer;
- attr->port_num = cmd.port_num;
- attr->timeout = cmd.timeout;
- attr->retry_cnt = cmd.retry_cnt;
- attr->rnr_retry = cmd.rnr_retry;
- attr->alt_port_num = cmd.alt_port_num;
- attr->alt_timeout = cmd.alt_timeout;
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
- memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
- attr->ah_attr.grh.flow_label = cmd.dest.flow_label;
- attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index;
- attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit;
- attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class;
- attr->ah_attr.dlid = cmd.dest.dlid;
- attr->ah_attr.sl = cmd.dest.sl;
- attr->ah_attr.src_path_bits = cmd.dest.src_path_bits;
- attr->ah_attr.static_rate = cmd.dest.static_rate;
- attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
- attr->ah_attr.port_num = cmd.dest.port_num;
+ init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext,
+ &qp_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
- memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
- attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label;
- attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index;
- attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit;
- attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
- attr->alt_ah_attr.dlid = cmd.alt_dest.dlid;
- attr->alt_ah_attr.sl = cmd.alt_dest.sl;
- attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits;
- attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
- attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
- attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
-
- err = xrcd->device->modify_xrc_rcv_qp(xrcd, cmd.qp_num, attr, cmd.attr_mask);
- put_xrcd_read(uobj);
- kfree(attr);
- return err ? err : in_len;
-}
+ if (cmd_exp.qp_type == IB_QPT_XRC_TGT) {
+ xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd_exp.qp_type == IB_QPT_XRC_INI) {
+ cmd_exp.max_recv_wr = 0;
+ cmd_exp.max_recv_sge = 0;
+ } else {
+ if (cmd_exp.is_srq) {
+ srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext);
+ if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
-ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- struct ib_uverbs_query_xrc_rcv_qp cmd;
- struct ib_uverbs_query_qp_resp resp;
- struct ib_qp_attr *attr;
- struct ib_qp_init_attr *init_attr;
- struct ib_xrcd *xrcd;
- struct ib_uobject *uobj;
- int ret;
+ if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) {
+ rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq);
+ rcq = rcq ?: scq;
+ pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext);
+ if (!pd || !scq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
- attr = kmalloc(sizeof *attr, GFP_KERNEL);
- init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
- if (!attr || !init_attr) {
- ret = -ENOMEM;
- goto out;
+ device = pd->device;
}
- xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
- if (!xrcd) {
- ret = -EINVAL;
- goto out;
+ memset(&attr, 0, sizeof(attr));
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd_exp.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd_exp.qp_type;
+ attr.create_flags = 0;
+
+ attr.cap.max_send_wr = cmd_exp.max_send_wr;
+ attr.cap.max_recv_wr = cmd_exp.max_recv_wr;
+ attr.cap.max_send_sge = cmd_exp.max_send_sge;
+ attr.cap.max_recv_sge = cmd_exp.max_recv_sge;
+ attr.cap.max_inline_data = cmd_exp.max_inline_data;
+
+ if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS)
+ attr.create_flags |= cmd_exp.qp_cap_flags &
+ (IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV);
+
+ if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) {
+ struct ib_uverbs_qpg *qpg;
+ if (cmd_exp.qp_type != IB_QPT_RAW_PACKET &&
+ cmd_exp.qp_type != IB_QPT_UD) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ qpg = &cmd_exp.qpg;
+ switch (qpg->qpg_type) {
+ case IB_QPG_PARENT:
+ attr.parent_attrib.rss_child_count =
+ qpg->parent_attrib.rss_child_count;
+ attr.parent_attrib.tss_child_count =
+ qpg->parent_attrib.tss_child_count;
+ break;
+ case IB_QPG_CHILD_RX:
+ case IB_QPG_CHILD_TX:
+ parentqp = idr_read_qp(qpg->parent_handle,
+ file->ucontext);
+ if (!parentqp) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ attr.qpg_parent = parentqp;
+ break;
+ default:
+ ret = -EINVAL;
+ goto err_put;
+ }
+ attr.qpg_type = qpg->qpg_type;
}
- ret = xrcd->device->query_xrc_rcv_qp(xrcd, cmd.qp_num, attr,
- cmd.attr_mask, init_attr);
+ if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV)
+ attr.max_inl_recv = cmd_exp.max_inl_recv;
- put_xrcd_read(uobj);
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
- if (ret)
- goto out;
+ if (cmd_exp.qp_type == IB_QPT_XRC_TGT)
+ qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr);
+ else
+ qp = device->exp_create_qp(pd, &attr, uhw);
- memset(&resp, 0, sizeof resp);
- resp.qp_state = attr->qp_state;
- resp.cur_qp_state = attr->cur_qp_state;
- resp.path_mtu = attr->path_mtu;
- resp.path_mig_state = attr->path_mig_state;
- resp.qkey = attr->qkey;
- resp.rq_psn = attr->rq_psn;
- resp.sq_psn = attr->sq_psn;
- resp.dest_qp_num = attr->dest_qp_num;
- resp.qp_access_flags = attr->qp_access_flags;
- resp.pkey_index = attr->pkey_index;
- resp.alt_pkey_index = attr->alt_pkey_index;
- resp.sq_draining = attr->sq_draining;
- resp.max_rd_atomic = attr->max_rd_atomic;
- resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
- resp.min_rnr_timer = attr->min_rnr_timer;
- resp.port_num = attr->port_num;
- resp.timeout = attr->timeout;
- resp.retry_cnt = attr->retry_cnt;
- resp.rnr_retry = attr->rnr_retry;
- resp.alt_port_num = attr->alt_port_num;
- resp.alt_timeout = attr->alt_timeout;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
- memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
- resp.dest.flow_label = attr->ah_attr.grh.flow_label;
- resp.dest.sgid_index = attr->ah_attr.grh.sgid_index;
- resp.dest.hop_limit = attr->ah_attr.grh.hop_limit;
- resp.dest.traffic_class = attr->ah_attr.grh.traffic_class;
- resp.dest.dlid = attr->ah_attr.dlid;
- resp.dest.sl = attr->ah_attr.sl;
- resp.dest.src_path_bits = attr->ah_attr.src_path_bits;
- resp.dest.static_rate = attr->ah_attr.static_rate;
- resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
- resp.dest.port_num = attr->ah_attr.port_num;
+ if (cmd_exp.qp_type != IB_QPT_XRC_TGT) {
+ qp->real_qp = qp;
+ qp->device = device;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ atomic_set(&qp->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&attr.send_cq->usecnt);
+ if (attr.recv_cq)
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ }
+ qp->uobject = &obj->uevent.uobject;
- memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
- resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label;
- resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index;
- resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit;
- resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
- resp.alt_dest.dlid = attr->alt_ah_attr.dlid;
- resp.alt_dest.sl = attr->alt_ah_attr.sl;
- resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
- resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate;
- resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
- resp.alt_dest.port_num = attr->alt_ah_attr.port_num;
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
- resp.max_send_wr = init_attr->cap.max_send_wr;
- resp.max_recv_wr = init_attr->cap.max_recv_wr;
- resp.max_send_sge = init_attr->cap.max_send_sge;
- resp.max_recv_sge = init_attr->cap.max_recv_sge;
- resp.max_inline_data = init_attr->cap.max_inline_data;
- resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+ memset(&resp_exp, 0, sizeof(resp_exp));
+ resp_exp.qpn = qp->qp_num;
+ resp_exp.qp_handle = obj->uevent.uobject.id;
+ resp_exp.max_recv_sge = attr.cap.max_recv_sge;
+ resp_exp.max_send_sge = attr.cap.max_send_sge;
+ resp_exp.max_recv_wr = attr.cap.max_recv_wr;
+ resp_exp.max_send_wr = attr.cap.max_send_wr;
+ resp_exp.max_inline_data = attr.cap.max_inline_data;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp))
- ret = -EFAULT;
+ if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) {
+ resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV;
+ resp_exp.max_inl_recv = attr.max_inl_recv;
+ }
-out:
- kfree(attr);
- kfree(init_attr);
+ ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp));
+ if (ret)
+ goto err_copy;
- return ret ? ret : in_len;
-}
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+ }
-ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- struct ib_uverbs_reg_xrc_rcv_qp cmd;
- struct ib_uxrc_rcv_object *qp_obj, *tmp;
- struct ib_xrcd *xrcd;
- struct ib_uobject *uobj;
- struct ib_uxrcd_object *xrcd_uobj;
- int ret;
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+ if (parentqp)
+ put_qp_read(parentqp);
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
- qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL);
- if (!qp_obj)
- return -ENOMEM;
+ obj->uevent.uobject.live = 1;
- xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
- if (!xrcd) {
- ret = -EINVAL;
- goto err_out;
- }
+ up_write(&obj->uevent.uobject.mutex);
- ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
- if (ret)
- goto err_put;
+ return ucore->inlen + uhw->inlen;
- xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
- list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list)
- if (cmd.qp_num == tmp->qp_num) {
- kfree(qp_obj);
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- put_xrcd_read(uobj);
- return in_len;
- }
- qp_obj->qp_num = cmd.qp_num;
- qp_obj->domain_handle = cmd.xrc_domain_handle;
- list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list);
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- atomic_inc(&xrcd->usecnt);
- put_xrcd_read(uobj);
- return in_len;
+err_copy:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
err_put:
- put_xrcd_read(uobj);
-err_out:
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+ if (parentqp)
+ put_qp_read(parentqp);
- kfree(qp_obj);
+ put_uobj_write(&obj->uevent.uobject);
return ret;
}
-int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
- struct ib_xrcd *xrcd, u32 qp_num)
-{
- int err;
- err = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
- if (!err)
- atomic_dec(&xrcd->usecnt);
- return err;
-}
-
-ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+int ib_exp_query_device(struct ib_device *device,
+ struct ib_exp_device_attr *device_attr)
{
- struct ib_uverbs_unreg_xrc_rcv_qp cmd;
- struct ib_uxrc_rcv_object *qp_obj, *tmp;
- struct ib_xrcd *xrcd;
- struct ib_uobject *uobj;
- struct ib_uxrcd_object *xrcd_uobj;
- int ret;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
- if (!xrcd)
- return -EINVAL;
-
- ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
- if (ret) {
- put_xrcd_read(uobj);
- return -EINVAL;
- }
- atomic_dec(&xrcd->usecnt);
-
- xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
- list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list)
- if (cmd.qp_num == qp_obj->qp_num) {
- list_del(&qp_obj->list);
- kfree(qp_obj);
- break;
- }
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
- put_xrcd_read(uobj);
- return in_len;
+ return device->exp_query_device(device, device_attr);
}
+EXPORT_SYMBOL(ib_exp_query_device);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c
index 30b9259..12bc0d3 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_main.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c
@@ -39,8 +39,13 @@
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include <linux/file.h>
#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/math64.h>
#include <asm/uaccess.h>
@@ -50,8 +55,6 @@ MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand userspace verbs access");
MODULE_LICENSE("Dual BSD/GPL");
-#define INFINIBANDEVENTFS_MAGIC 0x49426576 /* "IBev" */
-
enum {
IB_UVERBS_MAJOR = 231,
IB_UVERBS_BASE_MINOR = 192,
@@ -60,6 +63,31 @@ enum {
#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+static int uverbs_copy_from_udata_ex(void *dest, struct ib_udata *udata, size_t len)
+{
+ return copy_from_user(dest, udata->inbuf, min(udata->inlen, len)) ? -EFAULT : 0;
+}
+
+static int uverbs_copy_to_udata_ex(struct ib_udata *udata, void *src, size_t len)
+{
+ return copy_to_user(udata->outbuf, src, min(udata->outlen, len)) ? -EFAULT : 0;
+}
+
+static struct ib_udata_ops uverbs_copy_ex = {
+ .copy_from = uverbs_copy_from_udata_ex,
+ .copy_to = uverbs_copy_to_udata_ex
+};
+
+#define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->ops = &uverbs_copy_ex; \
+ (udata)->inbuf = (void __user *)(ibuf); \
+ (udata)->outbuf = (void __user *)(obuf); \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+
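The _ex helpers above clamp the transfer to min(udata->inlen or outlen, len), so a caller built against a shorter, older command layout still works: the missing tail of the larger kernel-side structure simply keeps its zeroed defaults. A userspace sketch of that contract follows, with stand-in types; the kernel versions use copy_from_user()/copy_to_user() and __user pointers.

	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	/* Hypothetical stand-ins for the kernel types, illustrating the
	 * min()-clamping contract of the _ex copy helpers. */
	struct udata_demo {
		const void *inbuf;
		size_t inlen;
	};

	static int copy_from_udata_demo(void *dest, struct udata_demo *u, size_t len)
	{
		size_t n = u->inlen < len ? u->inlen : len;

		memcpy(dest, u->inbuf, n);	/* kernel version: copy_from_user() */
		return 0;
	}

	int main(void)
	{
		struct { int a, b, c; } newer = { 0, 0, 0 };
		struct { int a, b; } older = { 1, 2 };	/* shorter, older-ABI command */
		struct udata_demo u = { &older, sizeof(older) };

		/* Only the bytes the user actually provided are copied; the tail
		 * of the newer, larger structure keeps its zeroed defaults. */
		copy_from_udata_demo(&newer, &u, sizeof(newer));
		printf("%d %d %d\n", newer.a, newer.b, newer.c);	/* 1 2 0 */
		return 0;
	}
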
static struct class *uverbs_class;
DEFINE_SPINLOCK(ib_uverbs_idr_lock);
@@ -70,10 +98,11 @@ DEFINE_IDR(ib_uverbs_ah_idr);
DEFINE_IDR(ib_uverbs_cq_idr);
DEFINE_IDR(ib_uverbs_qp_idr);
DEFINE_IDR(ib_uverbs_srq_idr);
-DEFINE_IDR(ib_uverbs_xrc_domain_idr);
+DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
+DEFINE_IDR(ib_uverbs_dct_idr);
-static spinlock_t map_lock;
-static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
+static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
@@ -86,6 +115,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
[IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
[IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
[IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
+ [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
[IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
[IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
@@ -107,20 +138,31 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
[IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
[IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
[IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
- [IB_USER_VERBS_CMD_CREATE_XRC_SRQ] = ib_uverbs_create_xrc_srq,
- [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrc_domain,
- [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrc_domain,
- [IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP] = ib_uverbs_create_xrc_rcv_qp,
- [IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP] = ib_uverbs_modify_xrc_rcv_qp,
- [IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP] = ib_uverbs_query_xrc_rcv_qp,
- [IB_USER_VERBS_CMD_REG_XRC_RCV_QP] = ib_uverbs_reg_xrc_rcv_qp,
- [IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP] = ib_uverbs_unreg_xrc_rcv_qp,
+ [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
+ [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
+ [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
+ [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
};
-#ifdef __linux__
-/* BSD Does not require a fake mountpoint for all files. */
-static struct vfsmount *uverbs_event_mnt;
-#endif
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw) = {
+ [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
+ [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
+};
+
+static ssize_t (*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw) = {
+ [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp,
+ [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq,
+ [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp,
+ [IB_USER_VERBS_EXP_CMD_CREATE_CQ] = ib_uverbs_exp_create_cq,
+ [IB_USER_VERBS_EXP_CMD_QUERY_DEVICE] = ib_uverbs_exp_query_device,
+ [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct,
+ [IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct,
+ [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct,
+};
static void ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device);
@@ -195,6 +237,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
struct ib_ucontext *context)
{
struct ib_uobject *uobj, *tmp;
+ int err;
if (!context)
return 0;
@@ -209,18 +252,55 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
kfree(uobj);
}
+ /* Remove MWs before QPs, in order to support type 2A MWs. */
+ list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) {
+ struct ib_mw *mw = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+ err = ib_dealloc_mw(mw);
+ if (err) {
+ pr_info("user_verbs: couldn't deallocate MW during cleanup.\n");
+ pr_info("user_verbs: the system may have become unstable.\n");
+ }
+ kfree(uobj);
+ }
+ list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+ struct ib_flow *flow_id = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+ ib_destroy_flow(flow_id);
+ kfree(uobj);
+ }
+
list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
struct ib_qp *qp = uobj->object;
struct ib_uqp_object *uqp =
container_of(uobj, struct ib_uqp_object, uevent.uobject);
idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
ib_uverbs_detach_umcast(qp, uqp);
- ib_destroy_qp(qp);
+ err = ib_destroy_qp(qp);
+ if (err)
+ pr_info("destroying uverbs qp failed: err %d\n", err);
+
ib_uverbs_release_uevent(file, &uqp->uevent);
kfree(uqp);
}
+ list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) {
+ struct ib_dct *dct = uobj->object;
+ struct ib_udct_object *udct =
+ container_of(uobj, struct ib_udct_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_dct_idr, uobj);
+
+ err = ib_destroy_dct(dct);
+ if (err)
+ pr_info("destroying uverbs dct failed: err %d\n", err);
+
+ kfree(udct);
+ }
list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
struct ib_srq *srq = uobj->object;
@@ -228,7 +308,9 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
container_of(uobj, struct ib_uevent_object, uobject);
idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
- ib_destroy_srq(srq);
+ err = ib_destroy_srq(srq);
+ if (err)
+ pr_info("destroying uverbs srq failed: err %d\n", err);
ib_uverbs_release_uevent(file, uevent);
kfree(uevent);
}
@@ -240,41 +322,37 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
container_of(uobj, struct ib_ucq_object, uobject);
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
- ib_destroy_cq(cq);
+ err = ib_destroy_cq(cq);
+ if (err)
+ pr_info("destroying uverbs cq failed: err %d\n", err);
+
ib_uverbs_release_ucq(file, ev_file, ucq);
kfree(ucq);
}
- /* XXX Free MWs */
-
list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
struct ib_mr *mr = uobj->object;
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
- ib_dereg_mr(mr);
+ err = ib_dereg_mr(mr);
+ if (err) {
+ pr_info("user_verbs: couldn't deregister an MR during cleanup.\n");
+ pr_info("user_verbs: the system may have become unstable.\n");
+ }
kfree(uobj);
}
- mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ mutex_lock(&file->device->xrcd_tree_mutex);
list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
struct ib_xrcd *xrcd = uobj->object;
- struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1;
- struct ib_uxrcd_object *xrcd_uobj =
+ struct ib_uxrcd_object *uxrcd =
container_of(uobj, struct ib_uxrcd_object, uobject);
- list_for_each_entry_safe(xrc_qp_obj, tmp1,
- &xrcd_uobj->xrc_reg_qp_list, list) {
- list_del(&xrc_qp_obj->list);
- ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd,
- xrc_qp_obj->qp_num);
- kfree(xrc_qp_obj);
- }
-
- idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
- ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd);
- kfree(uobj);
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ ib_uverbs_dealloc_xrcd(file->device, xrcd);
+ kfree(uxrcd);
}
- mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ mutex_unlock(&file->device->xrcd_tree_mutex);
list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
struct ib_pd *pd = uobj->object;
@@ -405,7 +483,8 @@ static const struct file_operations uverbs_event_fops = {
.read = ib_uverbs_event_read,
.poll = ib_uverbs_event_poll,
.release = ib_uverbs_event_close,
- .fasync = ib_uverbs_event_fasync
+ .fasync = ib_uverbs_event_fasync,
+ .llseek = no_llseek,
};
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
@@ -524,21 +603,13 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
NULL, NULL);
}
-void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
- void *context_ptr)
-{
- ib_uverbs_async_handler(context_ptr, event->element.xrc_qp_num,
- event->event, NULL, NULL);
-}
-
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
- int is_async, int *fd)
+ int is_async)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
- int ret;
- ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ ev_file = kzalloc(sizeof *ev_file, GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
@@ -547,43 +618,22 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
INIT_LIST_HEAD(&ev_file->event_list);
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
- ev_file->async_queue = NULL;
ev_file->is_async = is_async;
- ev_file->is_closed = 0;
- ev_file->filp = NULL;
-
- *fd = get_unused_fd();
- if (*fd < 0) {
- ret = *fd;
- goto err;
- }
/*
* fops_get() can't fail here, because we're coming from a
* system call on a uverbs file, which will already have a
* module reference.
*/
-#ifdef __linux__
- filp = alloc_file(uverbs_event_mnt, dget(uverbs_event_mnt->mnt_root),
- FMODE_READ, fops_get(&uverbs_event_fops));
-#else
filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops));
-#endif
- if (!filp) {
- ret = -ENFILE;
- goto err_fd;
- }
+ if (IS_ERR(filp)) {
+ kfree(ev_file);
+ } else {
filp->private_data = ev_file;
+ }
return filp;
-
-err_fd:
- put_unused_fd(*fd);
-
-err:
- kfree(ev_file);
- return ERR_PTR(ret);
}
/*
@@ -594,16 +644,15 @@ err:
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
{
struct ib_uverbs_event_file *ev_file = NULL;
- struct file *filp;
+ struct fd f = fdget(fd);
- filp = fget(fd);
- if (!filp)
+ if (!f.file)
return NULL;
- if (filp->f_op != &uverbs_event_fops)
+ if (f.file->f_op != &uverbs_event_fops)
goto out;
- ev_file = filp->private_data;
+ ev_file = f.file->private_data;
if (ev_file->is_async) {
ev_file = NULL;
goto out;
@@ -612,15 +661,225 @@ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
kref_get(&ev_file->ref);
out:
- fput(filp);
+ fdput(f);
return ev_file;
}
+static const char *verbs_cmd_str(__u32 cmd)
+{
+ switch (cmd) {
+ case IB_USER_VERBS_CMD_GET_CONTEXT:
+ return "GET_CONTEXT";
+ case IB_USER_VERBS_CMD_QUERY_DEVICE:
+ return "QUERY_DEVICE";
+ case IB_USER_VERBS_CMD_QUERY_PORT:
+ return "QUERY_PORT";
+ case IB_USER_VERBS_CMD_ALLOC_PD:
+ return "ALLOC_PD";
+ case IB_USER_VERBS_CMD_DEALLOC_PD:
+ return "DEALLOC_PD";
+ case IB_USER_VERBS_CMD_REG_MR:
+ return "REG_MR";
+ case IB_USER_VERBS_CMD_DEREG_MR:
+ return "DEREG_MR";
+ case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL:
+ return "CREATE_COMP_CHANNEL";
+ case IB_USER_VERBS_CMD_CREATE_CQ:
+ return "CREATE_CQ";
+ case IB_USER_VERBS_CMD_RESIZE_CQ:
+ return "RESIZE_CQ";
+ case IB_USER_VERBS_CMD_POLL_CQ:
+ return "POLL_CQ";
+ case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ:
+ return "REQ_NOTIFY_CQ";
+ case IB_USER_VERBS_CMD_DESTROY_CQ:
+ return "DESTROY_CQ";
+ case IB_USER_VERBS_CMD_CREATE_QP:
+ return "CREATE_QP";
+ case IB_USER_VERBS_CMD_QUERY_QP:
+ return "QUERY_QP";
+ case IB_USER_VERBS_CMD_MODIFY_QP:
+ return "MODIFY_QP";
+ case IB_USER_VERBS_CMD_DESTROY_QP:
+ return "DESTROY_QP";
+ case IB_USER_VERBS_CMD_POST_SEND:
+ return "POST_SEND";
+ case IB_USER_VERBS_CMD_POST_RECV:
+ return "POST_RECV";
+ case IB_USER_VERBS_CMD_POST_SRQ_RECV:
+ return "POST_SRQ_RECV";
+ case IB_USER_VERBS_CMD_CREATE_AH:
+ return "CREATE_AH";
+ case IB_USER_VERBS_CMD_DESTROY_AH:
+ return "DESTROY_AH";
+ case IB_USER_VERBS_CMD_ATTACH_MCAST:
+ return "ATTACH_MCAST";
+ case IB_USER_VERBS_CMD_DETACH_MCAST:
+ return "DETACH_MCAST";
+ case IB_USER_VERBS_CMD_CREATE_SRQ:
+ return "CREATE_SRQ";
+ case IB_USER_VERBS_CMD_MODIFY_SRQ:
+ return "MODIFY_SRQ";
+ case IB_USER_VERBS_CMD_QUERY_SRQ:
+ return "QUERY_SRQ";
+ case IB_USER_VERBS_CMD_DESTROY_SRQ:
+ return "DESTROY_SRQ";
+ case IB_USER_VERBS_CMD_OPEN_XRCD:
+ return "OPEN_XRCD";
+ case IB_USER_VERBS_CMD_CLOSE_XRCD:
+ return "CLOSE_XRCD";
+ case IB_USER_VERBS_CMD_CREATE_XSRQ:
+ return "CREATE_XSRQ";
+ case IB_USER_VERBS_CMD_OPEN_QP:
+ return "OPEN_QP";
+ }
+
+ return "Unknown command";
+}
+
+enum {
+ COMMAND_INFO_MASK = 0x1000,
+};
+
+static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ struct ib_device *dev,
+ struct ib_uverbs_cmd_hdr *hdr,
+ size_t count,
+ int legacy_ex_cmd)
+{
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+ __u32 command = hdr->command - IB_USER_VERBS_EXP_CMD_FIRST;
+
+ if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) ||
+ !uverbs_exp_cmd_table[command])
+ return -EINVAL;
+
+ if (!file->ucontext)
+ return -EINVAL;
+
+ if (!(dev->uverbs_exp_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ if (legacy_ex_cmd) {
+ struct ib_uverbs_ex_cmd_hdr_legacy hxl;
+ struct ib_uverbs_ex_cmd_resp1_legacy resp1;
+ __u64 response;
+ ssize_t ret;
+
+ if (count < sizeof(hxl))
+ return -EINVAL;
+
+ if (copy_from_user(&hxl, buf, sizeof(hxl)))
+ return -EFAULT;
+
+ if (((hxl.in_words + hxl.provider_in_words) * 4) != count)
+ return -EINVAL;
+
+ count -= sizeof(hxl);
+ buf += sizeof(hxl);
+ if (hxl.out_words || hxl.provider_out_words) {
+ if (count < sizeof(resp1))
+ return -EINVAL;
+ if (copy_from_user(&resp1, buf, sizeof(resp1)))
+ return -EFAULT;
+ response = resp1.response;
+ if (!response)
+ return -EINVAL;
+
+ /*
+ * Temporarily rewrite the user buffer to match the new extension format.
+ */
+ if (sizeof(resp1.comp_mask) != sizeof(resp1.response))
+ return -EFAULT;
+ buf += sizeof(resp1.comp_mask);
+ if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask,
+ sizeof(resp1.response)))
+ return -EFAULT;
+
+ } else {
+ response = 0;
+ }
+
+ INIT_UDATA_EX(&ucore,
+ (hxl.in_words) ? buf : 0,
+ response,
+ hxl.in_words * 4,
+ hxl.out_words * 4);
+
+ INIT_UDATA_EX(&uhw,
+ (hxl.provider_in_words) ? buf + ucore.inlen : 0,
+ (hxl.provider_out_words) ? response + ucore.outlen : 0,
+ hxl.provider_in_words * 4,
+ hxl.provider_out_words * 4);
+
+ ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw);
+ /*
+ * UnChange user buffer
+ */
+ if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response)))
+ return -EFAULT;
+
+ return ret;
+ } else {
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+
+ buf += sizeof(hdr) + sizeof(ex_hdr);
+
+ if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (ex_hdr.response) {
+ if (!hdr->out_words && !ex_hdr.provider_out_words)
+ return -EINVAL;
+ } else {
+ if (hdr->out_words || ex_hdr.provider_out_words)
+ return -EINVAL;
+ }
+
+ INIT_UDATA_EX(&ucore,
+ (hdr->in_words) ? buf : 0,
+ (unsigned long)ex_hdr.response,
+ hdr->in_words * 8,
+ hdr->out_words * 8);
+
+ INIT_UDATA_EX(&uhw,
+ (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
+ (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ return uverbs_exp_cmd_table[command](file, &ucore, &uhw);
+ }
+}
+
static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *dev = file->device->ib_dev;
struct ib_uverbs_cmd_hdr hdr;
+ struct timespec ts1;
+ struct timespec ts2;
+ ktime_t t1, t2, delta;
+ s64 ds;
+ ssize_t ret;
+ u64 dividend;
+ u32 divisor;
+ __u32 flags;
+ __u32 command;
+ int legacy_ex_cmd = 0;
+ size_t written_count = count;
if (count < sizeof hdr)
return -EINVAL;
@@ -628,20 +887,126 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof hdr))
return -EFAULT;
+ /*
+ * For backward compatibility, map old-style extension verbs commands
+ * to their equivalent experimental commands.
+ */
+ if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) &&
+ (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) {
+ hdr.command += IB_USER_VERBS_EXP_CMD_FIRST -
+ IB_USER_VERBS_LEGACY_CMD_FIRST;
+ legacy_ex_cmd = 1;
+ }
+
+ flags = (hdr.command &
+ IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ ktime_get_ts(&ts1);
+ if (!flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST)) {
+ ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd);
+ } else if (!flags) {
+ if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+ !uverbs_cmd_table[command])
+ return -EINVAL;
+
+ if (!file->ucontext &&
+ command != IB_USER_VERBS_CMD_GET_CONTEXT)
+ return -EINVAL;
+
+ if (!(dev->uverbs_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
if (hdr.in_words * 4 != count)
return -EINVAL;
- if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
- !uverbs_cmd_table[hdr.command] ||
- !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
+ ret = uverbs_cmd_table[command](file,
+ buf + sizeof(hdr),
+ hdr.in_words * 4,
+ hdr.out_words * 4);
+ } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
return -EINVAL;
- if (!file->ucontext &&
- hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
+ if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+ !uverbs_ex_cmd_table[command])
+ return -EINVAL;
+
+ if (!file->ucontext)
+ return -EINVAL;
+
+ if (!(dev->uverbs_ex_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+
+ count -= sizeof(hdr) + sizeof(ex_hdr);
+ buf += sizeof(hdr) + sizeof(ex_hdr);
+
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (ex_hdr.response) {
+ if (!hdr.out_words && !ex_hdr.provider_out_words)
+ return -EINVAL;
+ } else {
+ if (hdr.out_words || ex_hdr.provider_out_words)
return -EINVAL;
+ }
+
+ INIT_UDATA_EX(&ucore,
+ (hdr.in_words) ? buf : 0,
+ (unsigned long)ex_hdr.response,
+ hdr.in_words * 8,
+ hdr.out_words * 8);
+
+ INIT_UDATA_EX(&uhw,
+ (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
+ (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw);
+
+ if (ret)
+ return ret;
+
+ return written_count;
+
+ } else {
+ return -EFAULT;
+ }
- return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr,
- hdr.in_words * 4, hdr.out_words * 4);
+ if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) {
+ ktime_get_ts(&ts2);
+ t1 = timespec_to_ktime(ts1);
+ t2 = timespec_to_ktime(ts2);
+ delta = ktime_sub(t2, t1);
+ ds = ktime_to_ns(delta);
+ spin_lock(&dev->cmd_perf_lock);
+ dividend = dev->cmd_avg * dev->cmd_n + ds;
+ ++dev->cmd_n;
+ divisor = dev->cmd_n;
+ do_div(dividend, divisor);
+ dev->cmd_avg = dividend;
+ spin_unlock(&dev->cmd_perf_lock);
+ if (dev->cmd_perf & COMMAND_INFO_MASK) {
+ pr_info("%s: %s execution time = %lld nsec\n",
+ file->device->ib_dev->name,
+ verbs_cmd_str(hdr.command),
+ (long long)ds);
+ }
+ }
+ return ret;
}
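
For the extended path handled above, the write() payload has to match the word counts carried in the two headers before the udata pair is built. A standalone mirror of that bookkeeping follows; the 8- and 16-byte header sizes and the field layout are illustrative only, the authoritative structs being the ib_uverbs_cmd_hdr / ib_uverbs_ex_cmd_hdr definitions in the ABI header.

	#include <stddef.h>
	#include <stdint.h>

	/* Illustrative mirror of the two command headers. */
	struct demo_ex_hdr {
		uint32_t command;		/* flags in the high bits              */
		uint16_t in_words;		/* core payload, in 8-byte words       */
		uint16_t out_words;		/* core response, in 8-byte words      */
		uint64_t response;		/* user pointer for the response, or 0 */
		uint16_t provider_in_words;	/* provider payload, in 8-byte words   */
		uint16_t provider_out_words;	/* provider response, in 8-byte words  */
		uint32_t reserved;
	};

	/* Returns 0 when a write() of 'count' bytes is self-consistent,
	 * mirroring the extended-command checks in ib_uverbs_write(). */
	int demo_ex_cmd_size_ok(const struct demo_ex_hdr *h, size_t count)
	{
		const size_t hdr_bytes = 8 + 16;	/* legacy hdr + extended hdr */

		if (count < hdr_bytes)
			return -1;
		if ((size_t)(h->in_words + h->provider_in_words) * 8 != count - hdr_bytes)
			return -1;
		if (h->response) {
			if (!h->out_words && !h->provider_out_words)
				return -1;
		} else {
			if (h->out_words || h->provider_out_words)
				return -1;
		}
		return 0;
	}
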
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -653,18 +1018,51 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
else
return file->device->ib_dev->mmap(file->ucontext, vma);
}
+/* XXX Not supported in FreeBSD */
+#if 0
+static unsigned long ib_uverbs_get_unmapped_area(struct file *filp,
+ unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ if (!file->ucontext)
+ return -ENODEV;
+ else {
+ if (!file->device->ib_dev->get_unmapped_area)
+ return current->mm->get_unmapped_area(filp, addr, len,
+ pgoff, flags);
+
+ return file->device->ib_dev->get_unmapped_area(filp, addr, len,
+ pgoff, flags);
+ }
+}
+#endif
+
+static long ib_uverbs_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ if (!file->device->ib_dev->ioctl)
+ return -ENOTSUPP;
+
+ if (!file->ucontext)
+ return -ENODEV;
+ else
+ /* the provider should supply its own locking mechanism */
+ return file->device->ib_dev->ioctl(file->ucontext, cmd, arg);
+}
/*
* ib_uverbs_open() does not need the BKL:
*
- * - dev_table[] accesses are protected by map_lock, the
- * ib_uverbs_device structures are properly reference counted, and
+ * - the ib_uverbs_device structures are properly reference counted and
* everything else is purely local to the file being created, so
* races against other open calls are not a problem;
* - there is no ioctl method to race against;
- * - the device is added to dev_table[] as the last part of module
- * initialization, the open method will either immediately run
- * -ENXIO, or all required initialization will be done.
+ * - the open method will either fail immediately with -ENXIO, or all
+ * required initialization will be done.
*/
static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
@@ -672,13 +1070,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
struct ib_uverbs_file *file;
int ret;
- spin_lock(&map_lock);
- dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+ dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev);
if (dev)
kref_get(&dev->ref);
- spin_unlock(&map_lock);
-
- if (!dev)
+ else
return -ENXIO;
if (!try_module_get(dev->ib_dev->owner)) {
@@ -700,7 +1095,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
filp->private_data = file;
- return 0;
+ return nonseekable_open(inode, filp);
err_module:
module_put(dev->ib_dev->owner);
@@ -728,7 +1123,9 @@ static const struct file_operations uverbs_fops = {
.owner = THIS_MODULE,
.write = ib_uverbs_write,
.open = ib_uverbs_open,
- .release = ib_uverbs_close
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+ .unlocked_ioctl = ib_uverbs_ioctl,
};
static const struct file_operations uverbs_mmap_fops = {
@@ -736,7 +1133,13 @@ static const struct file_operations uverbs_mmap_fops = {
.write = ib_uverbs_write,
.mmap = ib_uverbs_mmap,
.open = ib_uverbs_open,
- .release = ib_uverbs_close
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+/* XXX Not supported in FreeBSD */
+#if 0
+ .get_unmapped_area = ib_uverbs_get_unmapped_area,
+#endif
+ .unlocked_ioctl = ib_uverbs_ioctl,
};
static struct ib_client uverbs_client = {
@@ -757,6 +1160,18 @@ static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
}
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static ssize_t show_dev_ref_cnt(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", dev->ref.count);
+}
+static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL);
+
static ssize_t show_dev_abi_version(struct device *device,
struct device_attribute *attr, char *buf)
{
@@ -773,8 +1188,36 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att
{
return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
}
+
static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+ "infiniband_verbs");
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+ if (ret >= IB_UVERBS_MAX_DEVICES)
+ return -1;
+
+ return ret;
+}
#include <linux/pci.h>
static ssize_t
@@ -801,6 +1244,7 @@ show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf)
return sprintf(buf, "0x%04x\n",
((struct pci_dev *)dev->ib_dev->dma_device)->vendor);
}
+
static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL);
struct attribute *device_attrs[] =
@@ -817,6 +1261,8 @@ static struct attribute_group device_group = {
static void ib_uverbs_add_one(struct ib_device *device)
{
+ int devnum;
+ dev_t base;
struct ib_uverbs_device *uverbs_dev;
if (!device->alloc_ucontext)
@@ -828,55 +1274,66 @@ static void ib_uverbs_add_one(struct ib_device *device)
kref_init(&uverbs_dev->ref);
init_completion(&uverbs_dev->comp);
+ uverbs_dev->xrcd_tree = RB_ROOT;
+ mutex_init(&uverbs_dev->xrcd_tree_mutex);
spin_lock(&map_lock);
- uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
- if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+ devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+ if (devnum >= IB_UVERBS_MAX_DEVICES) {
spin_unlock(&map_lock);
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
goto err;
+
+ spin_lock(&map_lock);
+ uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ uverbs_dev->devnum = devnum;
+ base = devnum + IB_UVERBS_BASE_DEV;
+ set_bit(devnum, dev_map);
}
- set_bit(uverbs_dev->devnum, dev_map);
spin_unlock(&map_lock);
uverbs_dev->ib_dev = device;
uverbs_dev->num_comp_vectors = device->num_comp_vectors;
- uverbs_dev->cdev = cdev_alloc();
- if (!uverbs_dev->cdev)
- goto err;
- uverbs_dev->cdev->owner = THIS_MODULE;
- uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
- kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
- if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+ cdev_init(&uverbs_dev->cdev, NULL);
+ uverbs_dev->cdev.owner = THIS_MODULE;
+ uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+ if (cdev_add(&uverbs_dev->cdev, base, 1))
goto err_cdev;
uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
- uverbs_dev->cdev->dev, uverbs_dev,
+ uverbs_dev->cdev.dev, uverbs_dev,
"uverbs%d", uverbs_dev->devnum);
if (IS_ERR(uverbs_dev->dev))
goto err_cdev;
if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
goto err_class;
+ if (device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt))
+ goto err_class;
if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
goto err_class;
if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group))
goto err_class;
- spin_lock(&map_lock);
- dev_table[uverbs_dev->devnum] = uverbs_dev;
- spin_unlock(&map_lock);
-
ib_set_client_data(device, &uverbs_client, uverbs_dev);
return;
err_class:
- device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
err_cdev:
- cdev_del(uverbs_dev->cdev);
- clear_bit(uverbs_dev->devnum, dev_map);
+ cdev_del(&uverbs_dev->cdev);
+ if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
err:
kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
@@ -894,42 +1351,30 @@ static void ib_uverbs_remove_one(struct ib_device *device)
sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group);
dev_set_drvdata(uverbs_dev->dev, NULL);
- device_destroy(uverbs_class, uverbs_dev->cdev->dev);
- cdev_del(uverbs_dev->cdev);
-
- spin_lock(&map_lock);
- dev_table[uverbs_dev->devnum] = NULL;
- spin_unlock(&map_lock);
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+ cdev_del(&uverbs_dev->cdev);
+ if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
clear_bit(uverbs_dev->devnum, dev_map);
+ else
+ clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
wait_for_completion(&uverbs_dev->comp);
kfree(uverbs_dev);
}
-#ifdef __linux__
-static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data,
- struct vfsmount *mnt)
+
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
{
- return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
- INFINIBANDEVENTFS_MAGIC, mnt);
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
}
-static struct file_system_type uverbs_event_fs = {
- /* No owner field so module can be unloaded */
- .name = "infinibandeventfs",
- .get_sb = uverbs_event_get_sb,
- .kill_sb = kill_litter_super
-};
-#endif
-
static int __init ib_uverbs_init(void)
{
int ret;
- spin_lock_init(&map_lock);
-
ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
"infiniband_verbs");
if (ret) {
@@ -944,43 +1389,22 @@ static int __init ib_uverbs_init(void)
goto out_chrdev;
}
+ uverbs_class->devnode = uverbs_devnode;
+
ret = class_create_file(uverbs_class, &class_attr_abi_version);
if (ret) {
printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
goto out_class;
}
-#ifdef __linux__
- ret = register_filesystem(&uverbs_event_fs);
- if (ret) {
- printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n");
- goto out_class;
- }
-
- uverbs_event_mnt = kern_mount(&uverbs_event_fs);
- if (IS_ERR(uverbs_event_mnt)) {
- ret = PTR_ERR(uverbs_event_mnt);
- printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n");
- goto out_fs;
- }
-#endif
-
ret = ib_register_client(&uverbs_client);
if (ret) {
printk(KERN_ERR "user_verbs: couldn't register client\n");
- goto out_mnt;
+ goto out_class;
}
return 0;
-out_mnt:
-#ifdef __linux__
- mntput(uverbs_event_mnt);
-
-out_fs:
- unregister_filesystem(&uverbs_event_fs);
-#endif
-
out_class:
class_destroy(uverbs_class);
@@ -994,12 +1418,10 @@ out:
static void __exit ib_uverbs_cleanup(void)
{
ib_unregister_client(&uverbs_client);
-#ifdef __linux__
- mntput(uverbs_event_mnt);
- unregister_filesystem(&uverbs_event_fs);
-#endif
class_destroy(uverbs_class);
unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
idr_destroy(&ib_uverbs_pd_idr);
idr_destroy(&ib_uverbs_mr_idr);
idr_destroy(&ib_uverbs_mw_idr);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
index 5440da0..a541882 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
@@ -30,6 +30,7 @@
* SOFTWARE.
*/
+#include <linux/module.h>
#include <rdma/ib_marshall.h>
void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
@@ -40,18 +41,21 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
dst->grh.sgid_index = src->grh.sgid_index;
dst->grh.hop_limit = src->grh.hop_limit;
dst->grh.traffic_class = src->grh.traffic_class;
+ memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved));
dst->dlid = src->dlid;
dst->sl = src->sl;
dst->src_path_bits = src->src_path_bits;
dst->static_rate = src->static_rate;
dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0;
dst->port_num = src->port_num;
+ dst->reserved = 0;
}
EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
struct ib_qp_attr *src)
{
+ dst->qp_state = src->qp_state;
dst->cur_qp_state = src->cur_qp_state;
dst->path_mtu = src->path_mtu;
dst->path_mig_state = src->path_mig_state;
@@ -83,6 +87,7 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
dst->rnr_retry = src->rnr_retry;
dst->alt_port_num = src->alt_port_num;
dst->alt_timeout = src->alt_timeout;
+ memset(dst->reserved, 0, sizeof(dst->reserved));
}
EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c
index 023564f..51a0ed5 100644
--- a/sys/ofed/drivers/infiniband/core/verbs.c
+++ b/sys/ofed/drivers/infiniband/core/verbs.c
@@ -38,10 +38,13 @@
#include <linux/errno.h>
#include <linux/err.h>
+#include <linux/module.h>
#include <linux/string.h>
+#include <linux/slab.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
int ib_rate_to_mult(enum ib_rate rate)
{
@@ -77,6 +80,31 @@ enum ib_rate mult_to_ib_rate(int mult)
}
EXPORT_SYMBOL(mult_to_ib_rate);
+int ib_rate_to_mbps(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 2500;
+ case IB_RATE_5_GBPS: return 5000;
+ case IB_RATE_10_GBPS: return 10000;
+ case IB_RATE_20_GBPS: return 20000;
+ case IB_RATE_30_GBPS: return 30000;
+ case IB_RATE_40_GBPS: return 40000;
+ case IB_RATE_60_GBPS: return 60000;
+ case IB_RATE_80_GBPS: return 80000;
+ case IB_RATE_120_GBPS: return 120000;
+ case IB_RATE_14_GBPS: return 14062;
+ case IB_RATE_56_GBPS: return 56250;
+ case IB_RATE_112_GBPS: return 112500;
+ case IB_RATE_168_GBPS: return 168750;
+ case IB_RATE_25_GBPS: return 25781;
+ case IB_RATE_100_GBPS: return 103125;
+ case IB_RATE_200_GBPS: return 206250;
+ case IB_RATE_300_GBPS: return 309375;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
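
ib_rate_to_mbps() returns -1 for a rate it does not recognize, so callers should treat negative values as a sentinel rather than a speed. A small hedged usage sketch; demo_port_speed_kbps() is a hypothetical helper, not part of the patch.

	#include <rdma/ib_verbs.h>

	/* Hypothetical helper: convert an ib_rate to kb/s, mapping the -1
	 * sentinel for unknown rates to zero. */
	static u64 demo_port_speed_kbps(enum ib_rate rate)
	{
		int mbps = ib_rate_to_mbps(rate);

		if (mbps < 0)
			return 0;
		return (u64)mbps * 1000;
	}
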
+
enum rdma_transport_type
rdma_node_get_transport(enum rdma_node_type node_type)
{
@@ -87,6 +115,8 @@ rdma_node_get_transport(enum rdma_node_type node_type)
return RDMA_TRANSPORT_IB;
case RDMA_NODE_RNIC:
return RDMA_TRANSPORT_IWARP;
+ case RDMA_NODE_MIC:
+ return RDMA_TRANSPORT_SCIF;
default:
BUG();
return 0;
@@ -104,6 +134,8 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
return IB_LINK_LAYER_INFINIBAND;
case RDMA_TRANSPORT_IWARP:
return IB_LINK_LAYER_ETHERNET;
+ case RDMA_TRANSPORT_SCIF:
+ return IB_LINK_LAYER_SCIF;
default:
return IB_LINK_LAYER_UNSPECIFIED;
}
@@ -162,8 +194,29 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
u32 flow_class;
u16 gid_index;
int ret;
+ int is_eth = (rdma_port_get_link_layer(device, port_num) ==
+ IB_LINK_LAYER_ETHERNET);
memset(ah_attr, 0, sizeof *ah_attr);
+ if (is_eth) {
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ if (wc->wc_flags & IB_WC_WITH_SMAC &&
+ wc->wc_flags & IB_WC_WITH_VLAN) {
+ memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+ ah_attr->vlan_id = wc->vlan_id;
+ } else {
+ ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+ ah_attr->dmac, &ah_attr->vlan_id);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ah_attr->vlan_id = 0xffff;
+ }
+
+
ah_attr->dlid = wc->slid;
ah_attr->sl = wc->sl;
ah_attr->src_path_bits = wc->dlid_path_bits;
@@ -250,45 +303,20 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,
srq->uobject = NULL;
srq->event_handler = srq_init_attr->event_handler;
srq->srq_context = srq_init_attr->srq_context;
- srq->ext.xrc.cq = NULL;
- srq->ext.xrc.xrcd = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&srq->usecnt, 0);
+ srq->srq_type = srq_init_attr->srq_type;
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ atomic_inc(&srq->ext.xrc.cq->usecnt);
}
-
- return srq;
-}
-EXPORT_SYMBOL(ib_create_srq);
-
-struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
- struct ib_cq *xrc_cq,
- struct ib_xrcd *xrcd,
- struct ib_srq_init_attr *srq_init_attr)
-{
- struct ib_srq *srq;
-
- if (!pd->device->create_xrc_srq)
- return ERR_PTR(-ENOSYS);
-
- srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL);
-
- if (!IS_ERR(srq)) {
- srq->device = pd->device;
- srq->pd = pd;
- srq->uobject = NULL;
- srq->event_handler = srq_init_attr->event_handler;
- srq->srq_context = srq_init_attr->srq_context;
- srq->ext.xrc.cq = xrc_cq;
- srq->ext.xrc.xrcd = xrcd;
atomic_inc(&pd->usecnt);
- atomic_inc(&xrcd->usecnt);
- atomic_inc(&xrc_cq->usecnt);
atomic_set(&srq->usecnt, 0);
}
return srq;
}
-EXPORT_SYMBOL(ib_create_xrc_srq);
+EXPORT_SYMBOL(ib_create_srq);
int ib_modify_srq(struct ib_srq *srq,
struct ib_srq_attr *srq_attr,
@@ -308,27 +336,39 @@ int ib_query_srq(struct ib_srq *srq,
}
EXPORT_SYMBOL(ib_query_srq);
+int ib_query_values(struct ib_device *device,
+ int q_values, struct ib_device_values *values)
+{
+ return device->query_values ?
+ device->query_values(device, q_values, values) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_values);
+
int ib_destroy_srq(struct ib_srq *srq)
{
struct ib_pd *pd;
- struct ib_cq *xrc_cq;
- struct ib_xrcd *xrcd;
+ enum ib_srq_type srq_type;
+ struct ib_xrcd *uninitialized_var(xrcd);
+ struct ib_cq *uninitialized_var(cq);
int ret;
if (atomic_read(&srq->usecnt))
return -EBUSY;
pd = srq->pd;
- xrc_cq = srq->ext.xrc.cq;
+ srq_type = srq->srq_type;
+ if (srq_type == IB_SRQT_XRC) {
xrcd = srq->ext.xrc.xrcd;
+ cq = srq->ext.xrc.cq;
+ }
ret = srq->device->destroy_srq(srq);
if (!ret) {
atomic_dec(&pd->usecnt);
- if (xrc_cq)
- atomic_dec(&xrc_cq->usecnt);
- if (xrcd)
+ if (srq_type == IB_SRQT_XRC) {
atomic_dec(&xrcd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
}
return ret;
@@ -337,32 +377,130 @@ EXPORT_SYMBOL(ib_destroy_srq);
/* Queue pairs */
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ /* The code below must be synced with deletions of existing qps (ib_close_qp) --
+ * because a qp from the list may be closed during the scan, resulting in a kernel Oops.
+ */
+ spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+ void (*event_handler)(struct ib_event *, void *),
+ void *qp_context)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->real_qp = real_qp;
+ atomic_inc(&real_qp->usecnt);
+ qp->device = real_qp->device;
+ qp->event_handler = event_handler;
+ qp->qp_context = qp_context;
+ qp->qp_num = real_qp->qp_num;
+ qp->qp_type = real_qp->qp_type;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_add(&qp->open_list, &real_qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr)
+{
+ struct ib_qp *qp, *real_qp;
+
+ if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+ return ERR_PTR(-EINVAL);
+
+ qp = ERR_PTR(-EINVAL);
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+ if (real_qp->qp_num == qp_open_attr->qp_num) {
+ qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+ qp_open_attr->qp_context);
+ break;
+ }
+ }
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+ return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
struct ib_qp *ib_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *qp_init_attr)
{
- struct ib_qp *qp;
+ struct ib_qp *qp, *real_qp;
+ struct ib_device *device;
- qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+ device = pd ? pd->device : qp_init_attr->xrcd->device;
+ qp = device->create_qp(pd, qp_init_attr, NULL);
if (!IS_ERR(qp)) {
- qp->device = pd->device;
- qp->pd = pd;
- qp->send_cq = qp_init_attr->send_cq;
- qp->recv_cq = qp_init_attr->recv_cq;
- qp->srq = qp_init_attr->srq;
+ qp->device = device;
+ qp->real_qp = qp;
qp->uobject = NULL;
+ qp->qp_type = qp_init_attr->qp_type;
+
+ atomic_set(&qp->usecnt, 0);
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ real_qp = qp;
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (!IS_ERR(qp))
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+ else
+ real_qp->device->destroy_qp(real_qp);
+ } else {
qp->event_handler = qp_init_attr->event_handler;
qp->qp_context = qp_init_attr->qp_context;
- qp->qp_type = qp_init_attr->qp_type;
- qp->xrcd = qp->qp_type == IB_QPT_XRC ?
- qp_init_attr->xrcd : NULL;
- atomic_inc(&pd->usecnt);
- atomic_inc(&qp_init_attr->send_cq->usecnt);
+ if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+ qp->recv_cq = NULL;
+ qp->srq = NULL;
+ } else {
+ qp->recv_cq = qp_init_attr->recv_cq;
atomic_inc(&qp_init_attr->recv_cq->usecnt);
- if (qp_init_attr->srq)
+ qp->srq = qp_init_attr->srq;
+ if (qp->srq)
atomic_inc(&qp_init_attr->srq->usecnt);
- if (qp->qp_type == IB_QPT_XRC)
- atomic_inc(&qp->xrcd->usecnt);
+ }
+
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->xrcd = NULL;
+
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ }
}
return qp;
@@ -371,8 +509,10 @@ EXPORT_SYMBOL(ib_create_qp);
static const struct {
int valid;
- enum ib_qp_attr_mask req_param[IB_QPT_RAW_PACKET + 1];
- enum ib_qp_attr_mask opt_param[IB_QPT_RAW_PACKET + 1];
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX];
} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
[IB_QPS_RESET] = {
[IB_QPS_RESET] = { .valid = 1 },
@@ -389,13 +529,24 @@ static const struct {
[IB_QPT_RC] = (IB_QP_PKEY_INDEX |
IB_QP_PORT |
IB_QP_ACCESS_FLAGS),
- [IB_QPT_XRC] = (IB_QP_PKEY_INDEX |
+ [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_DC_KEY),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
IB_QP_PORT |
IB_QP_ACCESS_FLAGS),
[IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
[IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_GROUP_RSS,
+ [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS
}
},
},
@@ -414,7 +565,13 @@ static const struct {
[IB_QPT_RC] = (IB_QP_PKEY_INDEX |
IB_QP_PORT |
IB_QP_ACCESS_FLAGS),
- [IB_QPT_XRC] = (IB_QP_PKEY_INDEX |
+ [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
IB_QP_PORT |
IB_QP_ACCESS_FLAGS),
[IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
@@ -436,13 +593,26 @@ static const struct {
IB_QP_RQ_PSN |
IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_MIN_RNR_TIMER),
- [IB_QPT_XRC] = (IB_QP_AV |
+ [IB_QPT_DC_INI] = (IB_QP_PATH_MTU |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
IB_QP_PATH_MTU |
IB_QP_DEST_QPN |
IB_QP_RQ_PSN |
IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_MIN_RNR_TIMER),
},
+ .req_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_SMAC),
+ [IB_QPT_UC] = (IB_QP_SMAC),
+ [IB_QPT_XRC_INI] = (IB_QP_SMAC),
+ [IB_QPT_XRC_TGT] = (IB_QP_SMAC)
+ },
.opt_param = {
[IB_QPT_UD] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
@@ -452,13 +622,34 @@ static const struct {
[IB_QPT_RC] = (IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX),
- [IB_QPT_XRC] = (IB_QP_ALT_PATH |
+ [IB_QPT_DC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX),
[IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
[IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_AV,
+ },
+ .opt_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_UC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID)
}
}
},
@@ -475,11 +666,17 @@ static const struct {
IB_QP_RNR_RETRY |
IB_QP_SQ_PSN |
IB_QP_MAX_QP_RD_ATOMIC),
- [IB_QPT_XRC] = (IB_QP_TIMEOUT |
+ [IB_QPT_DC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
IB_QP_RETRY_CNT |
IB_QP_RNR_RETRY |
IB_QP_SQ_PSN |
IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
[IB_QPT_SMI] = IB_QP_SQ_PSN,
[IB_QPT_GSI] = IB_QP_SQ_PSN,
},
@@ -495,7 +692,16 @@ static const struct {
IB_QP_ACCESS_FLAGS |
IB_QP_MIN_RNR_TIMER |
IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ [IB_QPT_DC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_MIN_RNR_TIMER |
@@ -524,7 +730,16 @@ static const struct {
IB_QP_ALT_PATH |
IB_QP_PATH_MIG_STATE |
IB_QP_MIN_RNR_TIMER),
- [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ [IB_QPT_DC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
IB_QP_ACCESS_FLAGS |
IB_QP_ALT_PATH |
IB_QP_PATH_MIG_STATE |
@@ -541,7 +756,8 @@ static const struct {
[IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
[IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
[IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
}
@@ -564,7 +780,11 @@ static const struct {
IB_QP_ACCESS_FLAGS |
IB_QP_MIN_RNR_TIMER |
IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_MIN_RNR_TIMER |
@@ -597,12 +817,19 @@ static const struct {
IB_QP_PKEY_INDEX |
IB_QP_MIN_RNR_TIMER |
IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC] = (IB_QP_PORT |
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
IB_QP_AV |
IB_QP_TIMEOUT |
IB_QP_RETRY_CNT |
IB_QP_RNR_RETRY |
IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
@@ -640,7 +867,8 @@ static const struct {
};
int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
- enum ib_qp_type type, enum ib_qp_attr_mask mask)
+ enum ib_qp_type type, enum ib_qp_attr_mask mask,
+ enum rdma_link_layer ll)
{
enum ib_qp_attr_mask req_param, opt_param;
@@ -659,6 +887,13 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
req_param = qp_state_table[cur_state][next_state].req_param[type];
opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ req_param |= qp_state_table[cur_state][next_state].
+ req_param_add_eth[type];
+ opt_param |= qp_state_table[cur_state][next_state].
+ opt_param_add_eth[type];
+ }
+
if ((mask & req_param) != req_param)
return 0;
@@ -673,7 +908,13 @@ int ib_modify_qp(struct ib_qp *qp,
struct ib_qp_attr *qp_attr,
int qp_attr_mask)
{
- return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL);
+ int ret;
+
+ ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+ if (!ret && (qp_attr_mask & IB_QP_PORT))
+ qp->port_num = qp_attr->port_num;
+
+ return ret;
}
EXPORT_SYMBOL(ib_modify_qp);
@@ -683,35 +924,87 @@ int ib_query_qp(struct ib_qp *qp,
struct ib_qp_init_attr *qp_init_attr)
{
return qp->device->query_qp ?
- qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
-ENOSYS;
}
EXPORT_SYMBOL(ib_query_qp);
+int ib_close_qp(struct ib_qp *qp)
+{
+ struct ib_qp *real_qp;
+ unsigned long flags;
+
+ real_qp = qp->real_qp;
+ if (real_qp == qp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_del(&qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ atomic_dec(&real_qp->usecnt);
+ kfree(qp);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+ struct ib_xrcd *xrcd;
+ struct ib_qp *real_qp;
+ int ret;
+
+ real_qp = qp->real_qp;
+ xrcd = real_qp->xrcd;
+
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ ib_close_qp(qp);
+ if (atomic_read(&real_qp->usecnt) == 0)
+ list_del(&real_qp->xrcd_list);
+ else
+ real_qp = NULL;
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+
+ if (real_qp) {
+ ret = ib_destroy_qp(real_qp);
+ if (!ret)
+ atomic_dec(&xrcd->usecnt);
+ else
+ __ib_insert_xrcd_qp(xrcd, real_qp);
+ }
+
+ return 0;
+}
+
int ib_destroy_qp(struct ib_qp *qp)
{
struct ib_pd *pd;
struct ib_cq *scq, *rcq;
struct ib_srq *srq;
- struct ib_xrcd *xrcd;
- enum ib_qp_type qp_type = qp->qp_type;
int ret;
+ if (atomic_read(&qp->usecnt))
+ return -EBUSY;
+
+ if (qp->real_qp != qp)
+ return __ib_destroy_shared_qp(qp);
+
pd = qp->pd;
scq = qp->send_cq;
rcq = qp->recv_cq;
srq = qp->srq;
- xrcd = qp->xrcd;
ret = qp->device->destroy_qp(qp);
if (!ret) {
+ if (pd)
atomic_dec(&pd->usecnt);
+ if (scq)
atomic_dec(&scq->usecnt);
+ if (rcq)
atomic_dec(&rcq->usecnt);
if (srq)
atomic_dec(&srq->usecnt);
- if (qp_type == IB_QPT_XRC)
- atomic_dec(&xrcd->usecnt);
}
return ret;
@@ -726,8 +1019,13 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
void *cq_context, int cqe, int comp_vector)
{
struct ib_cq *cq;
+ struct ib_cq_init_attr attr = {
+ .cqe = cqe,
+ .comp_vector = comp_vector,
+ .flags = 0,
+ };
- cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+ cq = device->create_cq(device, &attr, NULL, NULL);
if (!IS_ERR(cq)) {
cq->device = device;
@@ -742,10 +1040,12 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
}
EXPORT_SYMBOL(ib_create_cq);
-int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+int ib_modify_cq(struct ib_cq *cq,
+ struct ib_cq_attr *cq_attr,
+ int cq_attr_mask)
{
return cq->device->modify_cq ?
- cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+ cq->device->modify_cq(cq, cq_attr, cq_attr_mask) : -ENOSYS;
}
EXPORT_SYMBOL(ib_modify_cq);
@@ -770,6 +1070,11 @@ EXPORT_SYMBOL(ib_resize_cq);
struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
{
struct ib_mr *mr;
+ int err;
+
+ err = ib_check_mr_access(mr_access_flags);
+ if (err)
+ return ERR_PTR(err);
mr = pd->device->get_dma_mr(pd, mr_access_flags);
@@ -792,6 +1097,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
u64 *iova_start)
{
struct ib_mr *mr;
+ int err;
+
+ err = ib_check_mr_access(mr_access_flags);
+ if (err)
+ return ERR_PTR(err);
if (!pd->device->reg_phys_mr)
return ERR_PTR(-ENOSYS);
@@ -822,6 +1132,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr,
struct ib_pd *old_pd;
int ret;
+ ret = ib_check_mr_access(mr_access_flags);
+ if (ret)
+ return ret;
+
if (!mr->device->rereg_phys_mr)
return -ENOSYS;
@@ -867,6 +1181,45 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+ struct ib_mr_init_attr *mr_init_attr)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->create_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->create_mr(pd, mr_init_attr);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_create_mr);
+
+int ib_destroy_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ pd = mr->pd;
+ ret = mr->device->destroy_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_mr);
+
struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
{
struct ib_mr *mr;
@@ -915,18 +1268,19 @@ EXPORT_SYMBOL(ib_free_fast_reg_page_list);
/* Memory windows */
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
{
struct ib_mw *mw;
if (!pd->device->alloc_mw)
return ERR_PTR(-ENOSYS);
- mw = pd->device->alloc_mw(pd);
+ mw = pd->device->alloc_mw(pd, type);
if (!IS_ERR(mw)) {
mw->device = pd->device;
mw->pd = pd;
mw->uobject = NULL;
+ mw->type = type;
atomic_inc(&pd->usecnt);
}
@@ -1000,58 +1354,58 @@ EXPORT_SYMBOL(ib_dealloc_fmr);
int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
+ int ret;
+
if (!qp->device->attach_mcast)
return -ENOSYS;
switch (rdma_node_get_transport(qp->device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (qp->qp_type == IB_QPT_RAW_PACKET) {
- /* In raw Etherent mgids the 63 msb's should be 0 */
- if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
- return -EINVAL;
- } else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) &&
+ qp->qp_type != IB_QPT_RAW_PACKET)
return -EINVAL;
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
if (qp->qp_type != IB_QPT_RAW_PACKET)
return -EINVAL;
break;
}
- return qp->device->attach_mcast(qp, gid, lid);
+
+ ret = qp->device->attach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_inc(&qp->usecnt);
+ return ret;
}
EXPORT_SYMBOL(ib_attach_mcast);
int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
+ int ret;
+
if (!qp->device->detach_mcast)
return -ENOSYS;
switch (rdma_node_get_transport(qp->device->node_type)) {
case RDMA_TRANSPORT_IB:
- if (qp->qp_type == IB_QPT_RAW_PACKET) {
- /* In raw Etherent mgids the 63 msb's should be 0 */
- if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
- return -EINVAL;
- } else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) &&
+ qp->qp_type != IB_QPT_RAW_PACKET)
return -EINVAL;
break;
case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_SCIF:
+
if (qp->qp_type != IB_QPT_RAW_PACKET)
return -EINVAL;
break;
}
- return qp->device->detach_mcast(qp, gid, lid);
-}
-EXPORT_SYMBOL(ib_detach_mcast);
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
-{
- if (atomic_read(&xrcd->usecnt))
- return -EBUSY;
-
- return xrcd->device->dealloc_xrcd(xrcd);
+ ret = qp->device->detach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_dec(&qp->usecnt);
+ return ret;
}
-EXPORT_SYMBOL(ib_dealloc_xrcd);
+EXPORT_SYMBOL(ib_detach_mcast);
struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
{
@@ -1064,10 +1418,119 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
if (!IS_ERR(xrcd)) {
xrcd->device = device;
xrcd->inode = NULL;
- xrcd->uobject = NULL;
atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
}
+
return xrcd;
}
EXPORT_SYMBOL(ib_alloc_xrcd);
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct ib_qp *qp;
+ int ret;
+
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ while (!list_empty(&xrcd->tgt_qp_list)) {
+ qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+ ret = ib_destroy_qp(qp);
+ if (ret)
+ return ret;
+ }
+
+ return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ int domain)
+{
+ struct ib_flow *flow_id;
+ if (!qp->device->create_flow)
+ return ERR_PTR(-ENOSYS);
+
+ flow_id = qp->device->create_flow(qp, flow_attr, domain);
+ if (!IS_ERR(flow_id))
+ atomic_inc(&qp->usecnt);
+ return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+ int err;
+ struct ib_qp *qp;
+
+ if (!flow_id)
+ return -EINVAL;
+ qp = flow_id->qp;
+ if (!qp->device->destroy_flow)
+ return -ENOSYS;
+ err = qp->device->destroy_flow(flow_id);
+ if (!err)
+ atomic_dec(&qp->usecnt);
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct ib_dct *dct;
+
+ if (!pd->device->exp_create_dct)
+ return ERR_PTR(-ENOSYS);
+
+ dct = pd->device->exp_create_dct(pd, attr, udata);
+ if (!IS_ERR(dct)) {
+ dct->pd = pd;
+ dct->srq = attr->srq;
+ dct->cq = attr->cq;
+ atomic_inc(&dct->srq->usecnt);
+ atomic_inc(&dct->cq->usecnt);
+ atomic_inc(&dct->pd->usecnt);
+ }
+
+ return dct;
+}
+EXPORT_SYMBOL(ib_create_dct);
+
+int ib_destroy_dct(struct ib_dct *dct)
+{
+ int err;
+
+ if (!dct->device->exp_destroy_dct)
+ return -ENOSYS;
+
+ err = dct->device->exp_destroy_dct(dct);
+ if (!err) {
+ atomic_dec(&dct->srq->usecnt);
+ atomic_dec(&dct->cq->usecnt);
+ atomic_dec(&dct->pd->usecnt);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_dct);
+
+int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr)
+{
+ if (!dct->device->exp_query_dct)
+ return -ENOSYS;
+
+ return dct->device->exp_query_dct(dct, attr);
+}
+EXPORT_SYMBOL(ib_query_dct);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ return mr->device->check_mr_status ?
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_check_mr_status);
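The verbs.c hunks above replace the single IB_QPT_XRC type with separate IB_QPT_XRC_INI/IB_QPT_XRC_TGT types and introduce ib_open_qp()/ib_close_qp() so that several consumers can share one XRC target QP through an XRCD. As a rough sketch, not taken from the patch itself, a kernel consumer might attach to such a shared QP as follows; xrcd, shared_qp_num, my_qp_event_handler and my_ctx are assumed to be supplied by the caller:

	struct ib_qp_open_attr open_attr = {
		.event_handler	= my_qp_event_handler,	/* assumed callback */
		.qp_context	= my_ctx,		/* assumed private context */
		.qp_num		= shared_qp_num,	/* QP number published by the creator */
		.qp_type	= IB_QPT_XRC_TGT,	/* only XRC target QPs may be opened */
	};
	struct ib_qp *qp;

	qp = ib_open_qp(xrcd, &open_attr);
	if (IS_ERR(qp))
		return PTR_ERR(qp);
	/* ... receive on the shared QP ... */
	ib_close_qp(qp);	/* drops only this opener's reference */

The underlying QP is torn down only when the last user calls ib_destroy_qp(), which goes through __ib_destroy_shared_qp() above.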
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.c b/sys/ofed/drivers/infiniband/debug/memtrack.c
index 199b33b..7082856 100644
--- a/sys/ofed/drivers/infiniband/debug/memtrack.c
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.c
@@ -24,12 +24,21 @@
#ifdef kmalloc
#undef kmalloc
#endif
+#ifdef kmemdup
+ #undef kmemdup
+#endif
#ifdef kfree
#undef kfree
#endif
#ifdef vmalloc
#undef vmalloc
#endif
+#ifdef vzalloc
+ #undef vzalloc
+#endif
+#ifdef vzalloc_node
+ #undef vzalloc_node
+#endif
#ifdef vfree
#undef vfree
#endif
@@ -39,16 +48,59 @@
#ifdef kmem_cache_free
#undef kmem_cache_free
#endif
+#ifdef ioremap
+ #undef ioremap
+#endif
+#ifdef io_mapping_create_wc
+ #undef io_mapping_create_wc
+#endif
+#ifdef io_mapping_free
+ #undef io_mapping_free
+#endif
+#ifdef ioremap_nocache
+ #undef ioremap_nocache
+#endif
+#ifdef iounmap
+ #undef iounmap
+#endif
+#ifdef alloc_pages
+ #undef alloc_pages
+#endif
+#ifdef free_pages
+ #undef free_pages
+#endif
+#ifdef get_page
+ #undef get_page
+#endif
+#ifdef put_page
+ #undef put_page
+#endif
+#ifdef create_workqueue
+ #undef create_workqueue
+#endif
+#ifdef create_rt_workqueue
+ #undef create_rt_workqueue
+#endif
+#ifdef create_freezeable_workqueue
+ #undef create_freezeable_workqueue
+#endif
+#ifdef create_singlethread_workqueue
+ #undef create_singlethread_workqueue
+#endif
+#ifdef destroy_workqueue
+ #undef destroy_workqueue
+#endif
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/vmalloc.h>
-#include <linux/version.h>
+#include <linux/mm.h>
#include <asm/uaccess.h>
#include <linux/proc_fs.h>
-#include <memtrack.h>
+#include <linux/random.h>
+#include "memtrack.h"
#include <linux/moduleparam.h>
@@ -67,7 +119,7 @@ MODULE_LICENSE("GPL");
bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */
static unsigned long track_mask = -1; /* effectively everything */
module_param(track_mask, ulong, 0444);
-MODULE_PARM_DESC(track_mask, "bitmask definenig what is tracked");
+MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked");
/* if a bit is set then the corresponding allocation is strictly tracked.
That is, before inserting the whole range is checked to not overlap any
@@ -76,59 +128,95 @@ static unsigned long strict_track_mask = 0; /* no strict tracking */
module_param(strict_track_mask, ulong, 0444);
MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking");
-typedef struct memtrack_meminfo_st {
+/* Sets the frequency of allocations failures injections
+ if set to 0 all allocation should succeed */
+static unsigned int inject_freq = 0;
+module_param(inject_freq, uint, 0644);
+MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)");
+
+static int random_mem = 1;
+module_param(random_mem, uint, 0644);
+MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)");
+
+struct memtrack_meminfo_t {
unsigned long addr;
unsigned long size;
unsigned long line_num;
- struct memtrack_meminfo_st *next;
+ unsigned long dev;
+ unsigned long addr2;
+ int direction;
+ struct memtrack_meminfo_t *next;
struct list_head list; /* used to link all items from a certain type together */
char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. packing */
-} memtrack_meminfo_t;
+ char ext_info[32];
+};
static struct kmem_cache *meminfo_cache;
-typedef struct {
- memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ];
+struct tracked_obj_desc_t {
+ struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ];
spinlock_t hash_lock;
unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */
struct list_head tracked_objs_head; /* head of list of all objects */
int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */
-} tracked_obj_desc_t;
+};
-static tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES];
+static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES];
static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = {
"kmalloc",
"vmalloc",
- "kmem_cache_alloc"
+ "kmem_cache_alloc",
+ "io_remap",
+ "create_workqueue",
+ "alloc_pages",
+ "ib_dma_map_single",
+ "ib_dma_map_page",
+ "ib_dma_map_sg"
};
-
static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = {
"kfree",
"vfree",
- "kmem_cache_free"
+ "kmem_cache_free",
+ "io_unmap",
+ "destory_workqueue",
+ "free_pages",
+ "ib_dma_unmap_single",
+ "ib_dma_unmap_page",
+ "ib_dma_unmap_sg"
};
-
-static inline const char *memtype_alloc_str(memtrack_memtype_t memtype)
+static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype)
{
switch (memtype) {
- case MEMTRACK_KMALLOC:
- case MEMTRACK_VMALLOC:
- case MEMTRACK_KMEM_OBJ:
+ case MEMTRACK_KMALLOC:
+ case MEMTRACK_VMALLOC:
+ case MEMTRACK_KMEM_OBJ:
+ case MEMTRACK_IOREMAP:
+ case MEMTRACK_WORK_QUEUE:
+ case MEMTRACK_PAGE_ALLOC:
+ case MEMTRACK_DMA_MAP_SINGLE:
+ case MEMTRACK_DMA_MAP_PAGE:
+ case MEMTRACK_DMA_MAP_SG:
return rsc_names[memtype];
default:
return "(Unknown allocation type)";
}
}
-static inline const char *memtype_free_str(memtrack_memtype_t memtype)
+static inline const char *memtype_free_str(enum memtrack_memtype_t memtype)
{
switch (memtype) {
- case MEMTRACK_KMALLOC:
- case MEMTRACK_VMALLOC:
- case MEMTRACK_KMEM_OBJ:
+ case MEMTRACK_KMALLOC:
+ case MEMTRACK_VMALLOC:
+ case MEMTRACK_KMEM_OBJ:
+ case MEMTRACK_IOREMAP:
+ case MEMTRACK_WORK_QUEUE:
+ case MEMTRACK_PAGE_ALLOC:
+ case MEMTRACK_DMA_MAP_SINGLE:
+ case MEMTRACK_DMA_MAP_PAGE:
+ case MEMTRACK_DMA_MAP_SG:
return rsc_free_names[memtype];
default:
return "(Unknown allocation type)";
@@ -138,56 +226,56 @@ static inline const char *memtype_free_str(memtrack_memtype_t memtype)
/*
* overlap_a_b
*/
-static int overlap_a_b(unsigned long a_start, unsigned long a_end,
+static inline int overlap_a_b(unsigned long a_start, unsigned long a_end,
unsigned long b_start, unsigned long b_end)
{
- if ((b_start > a_end) || (a_start > b_end)) {
+ if ((b_start > a_end) || (a_start > b_end))
return 0;
- }
+
return 1;
}
/*
* check_overlap
*/
-static void check_overlap(memtrack_memtype_t memtype,
- memtrack_meminfo_t * mem_info_p,
- tracked_obj_desc_t * obj_desc_p)
+static void check_overlap(enum memtrack_memtype_t memtype,
+ struct memtrack_meminfo_t *mem_info_p,
+ struct tracked_obj_desc_t *obj_desc_p)
{
struct list_head *pos, *next;
- memtrack_meminfo_t *cur;
+ struct memtrack_meminfo_t *cur;
unsigned long start_a, end_a, start_b, end_b;
- list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) {
- cur = list_entry(pos, memtrack_meminfo_t, list);
-
start_a = mem_info_p->addr;
end_a = mem_info_p->addr + mem_info_p->size - 1;
+
+ list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) {
+ cur = list_entry(pos, struct memtrack_meminfo_t, list);
+
start_b = cur->addr;
end_b = cur->addr + cur->size - 1;
- if (overlap_a_b(start_a, end_a, start_b, end_b)) {
- printk
- ("%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n",
+ if (overlap_a_b(start_a, end_a, start_b, end_b))
+ printk(KERN_ERR "%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n",
memtype_alloc_str(memtype), mem_info_p->addr,
mem_info_p->addr + mem_info_p->size - 1, cur->addr,
cur->addr + cur->size - 1);
}
- }
}
/* Invoke on memory allocation */
-void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
- unsigned long size, const char *filename,
+void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev,
+ unsigned long addr, unsigned long size, unsigned long addr2,
+ int direction, const char *filename,
const unsigned long line_num, int alloc_flags)
{
unsigned long hash_val;
- memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p;
- tracked_obj_desc_t *obj_desc_p;
+ struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
unsigned long flags;
if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+ printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
return;
}
@@ -199,11 +287,9 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
hash_val = addr % MEMTRACK_HASH_SZ;
- new_mem_info_p = (memtrack_meminfo_t *)
- kmem_cache_alloc(meminfo_cache, alloc_flags);
+ new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags);
if (new_mem_info_p == NULL) {
- printk
- ("%s: Failed allocating kmem_cache item for new mem_info. "
+ printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. "
"Lost tracking on allocation at %s:%lu...\n", __func__,
filename, line_num);
return;
@@ -211,26 +297,34 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
/* save allocation properties */
new_mem_info_p->addr = addr;
new_mem_info_p->size = size;
+ new_mem_info_p->dev = dev;
+ new_mem_info_p->addr2 = addr2;
+ new_mem_info_p->direction = direction;
+
new_mem_info_p->line_num = line_num;
+ *new_mem_info_p->ext_info = '\0';
/* Make sure that we will print out the path tail if the given filename is longer
* than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file
* in the printout -- only the path head!
*/
- if (strlen(filename) > MAX_FILENAME_LEN) {
+ if (strlen(filename) > MAX_FILENAME_LEN)
strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN);
- } else {
+ else
strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN);
- }
+
new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */
memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
/* make sure given memory location is not already allocated */
+ if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) &&
+ (memtype != MEMTRACK_DMA_MAP_SG)) {
+
+ /* make sure given memory location is not already allocated */
cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
while (cur_mem_info_p != NULL) {
- if (cur_mem_info_p->addr == addr) {
+ if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) {
/* Found given address in the database */
- printk
- ("mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n",
+ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n",
__func__, filename, line_num,
memtype_alloc_str(memtype), addr,
cur_mem_info_p->filename,
@@ -241,31 +335,33 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
}
cur_mem_info_p = cur_mem_info_p->next;
}
+ }
/* not found - we can put in the hash bucket */
/* link as first */
new_mem_info_p->next = obj_desc_p->mem_hash[hash_val];
obj_desc_p->mem_hash[hash_val] = new_mem_info_p;
- if (obj_desc_p->strict_track) {
+ if (obj_desc_p->strict_track)
check_overlap(memtype, new_mem_info_p, obj_desc_p);
- }
obj_desc_p->count += size;
list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head);
memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
return;
}
+EXPORT_SYMBOL(memtrack_alloc);
/* Invoke on memory free */
-void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev,
+ unsigned long addr, unsigned long size, int direction,
const char *filename, const unsigned long line_num)
{
unsigned long hash_val;
- memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p;
- tracked_obj_desc_t *obj_desc_p;
+ struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
unsigned long flags;
if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+ printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
return;
}
@@ -282,13 +378,27 @@ void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
prev_mem_info_p = NULL;
cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
while (cur_mem_info_p != NULL) {
- if (cur_mem_info_p->addr == addr) {
- /* Found given address in the database - remove from the bucket/list */
- if (prev_mem_info_p == NULL) {
+ if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) {
+ /* Found given address in the database */
+ if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) ||
+ (memtype == MEMTRACK_DMA_MAP_SG)) {
+ if (direction != cur_mem_info_p->direction)
+ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n",
+ __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction,
+ cur_mem_info_p->filename, cur_mem_info_p->line_num);
+
+ if (size != cur_mem_info_p->size)
+ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n",
+ __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size,
+ cur_mem_info_p->filename, cur_mem_info_p->line_num);
+ }
+
+ /* Remove from the bucket/list */
+ if (prev_mem_info_p == NULL)
obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */
- } else {
+ else
prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */
- }
+
list_del(&cur_mem_info_p->list);
obj_desc_p->count -= cur_mem_info_p->size;
@@ -301,64 +411,317 @@ void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
}
/* not found */
- printk
- ("mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n",
- __func__, filename, line_num, memtype_free_str(memtype), addr);
+ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n",
+ __func__, filename, line_num, memtype_free_str(memtype), addr, dev);
memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
return;
}
+EXPORT_SYMBOL(memtrack_free);
+
+/*
+ * This function recognizes allocations which
+ * may be released by the kernel (e.g. skb) and
+ * are therefore not trackable by memtrack.
+ * The allocations are recognized by the name
+ * of their calling function.
+ */
+int is_non_trackable_alloc_func(const char *func_name)
+{
+ static const char * const str_str_arr[] = {
+ /* functions containing these strings are considered non-trackable */
+ "skb",
+ };
+ static const char * const str_str_excep_arr[] = {
+ /* functions which are exceptions to the str_str_arr table */
+ "ipoib_cm_skb_too_long"
+ };
+ static const char * const str_cmp_arr[] = {
+ /* functions that allocate SKBs */
+ "mlx4_en_alloc_frags",
+ "mlx4_en_alloc_frag",
+ "mlx4_en_init_allocator",
+ "mlx4_en_free_frag",
+ "mlx4_en_free_rx_desc",
+ "mlx4_en_destroy_allocator",
+ "mlx4_en_complete_rx_desc",
+ /* vnic skb functions */
+ "free_single_frag",
+ "vnic_alloc_rx_skb",
+ "vnic_rx_skb",
+ "vnic_alloc_frag",
+ "vnic_empty_rx_entry",
+ "vnic_init_allocator",
+ "vnic_destroy_allocator",
+ "sdp_post_recv",
+ "sdp_rx_ring_purge",
+ "sdp_post_srcavail",
+ "sk_stream_alloc_page",
+ "update_send_head",
+ "sdp_bcopy_get",
+ "sdp_destroy_resources",
+
+ /* function that allocates memory for the RDMA device context */
+ "ib_alloc_device"
+ };
+ size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *);
+ size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *);
+ size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *);
+
+ int i, j;
+
+ for (i = 0; i < str_str_arr_size; ++i)
+ if (strstr(func_name, str_str_arr[i])) {
+ for (j = 0; j < str_str_excep_size; ++j)
+ if (!strcmp(func_name, str_str_excep_arr[j]))
+ return 0;
+ return 1;
+ }
+ for (i = 0; i < str_cmp_arr_size; ++i)
+ if (!strcmp(func_name, str_cmp_arr[i]))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(is_non_trackable_alloc_func);
+
+/*
+ * In some cases we need to free memory
+ * we defined as "non trackable" (see
+ * is_non_trackable_alloc_func).
+ * This function recognizes such releases
+ * by the name of their calling function.
+ */
+int is_non_trackable_free_func(const char *func_name)
+{
+
+ static const char * const str_cmp_arr[] = {
+ /* function that deallocates memory for the RDMA device context */
+ "ib_dealloc_device"
+ };
+ size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *);
+
+ int i;
+
+ for (i = 0; i < str_cmp_arr_size; ++i)
+ if (!strcmp(func_name, str_cmp_arr[i]))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(is_non_trackable_free_func);
+
+
+/* WA - This function checks whether the calling
+   function name is
+   '__ib_umem_release' or 'ib_umem_get'.
+   In that case we won't track the
+   memory there, because the kernel
+   was the one who allocated it.
+   Return value:
+   1 - if the function name matches, else 0 */
+int is_umem_put_page(const char *func_name)
+{
+ const char func_str[18] = "__ib_umem_release";
+ /* In the error flow, put_page is called as part of ib_umem_get */
+ const char func_str1[12] = "ib_umem_get";
+
+ return ((strstr(func_name, func_str) != NULL) ||
+ (strstr(func_name, func_str1) != NULL)) ? 1 : 0;
+}
+EXPORT_SYMBOL(is_umem_put_page);
+
+/* Check page order size
+ When freeing a page allocation, it checks whether
+ we are trying to free the same size
+ we asked to allocate */
+int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr,
+ unsigned long size, const char *filename,
+ const unsigned long line_num)
+{
+ unsigned long hash_val;
+ struct memtrack_meminfo_t *cur_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+ int ret = 0;
+
+ if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+ printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
+ return 1;
+ }
+
+ if (!tracked_objs_arr[memtype]) {
+ /* object is not tracked */
+ return 1;
+ }
+ obj_desc_p = tracked_objs_arr[memtype];
+
+ hash_val = addr % MEMTRACK_HASH_SZ;
+
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
+ /* find mem_info of given memory location */
+ cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
+ while (cur_mem_info_p != NULL) {
+ if (cur_mem_info_p->addr == addr) {
+ /* Found given address in the database - check size */
+ if (cur_mem_info_p->size != size) {
+ printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n",
+ __func__, filename, line_num, memtype_free_str(memtype),
+ addr, size, cur_mem_info_p->size);
+ snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info),
+ "invalid free size %lu\n", size);
+ ret = 1;
+ }
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return ret;
+ }
+ cur_mem_info_p = cur_mem_info_p->next;
+ }
+
+ /* not found - this function gives no indication here,
+ it only checks that the size/order is correct.
+ Inconsistencies are reported by the 'free' function */
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return 1;
+}
+EXPORT_SYMBOL(memtrack_check_size);
+
+/* Search the current data-base for a specific addr
+ to see whether it exists.
+ It will print an error msg if we get an unexpected result.
+ Return value: 0 - if addr exists, else 1 */
+int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist,
+ const char *filename, const unsigned long line_num)
+{
+ unsigned long hash_val;
+ struct memtrack_meminfo_t *cur_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+
+ if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+ printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
+ return 1;
+ }
+
+ if (!tracked_objs_arr[memtype]) {
+ /* object is not tracked */
+ return 0;
+ }
+ obj_desc_p = tracked_objs_arr[memtype];
+
+ hash_val = addr % MEMTRACK_HASH_SZ;
+
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
+ /* find mem_info of given memory location */
+ cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
+ while (cur_mem_info_p != NULL) {
+ if (cur_mem_info_p->addr == addr) {
+ /* Found given address in the database - exiting */
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return 0;
+ }
+ cur_mem_info_p = cur_mem_info_p->next;
+ }
+
+ /* not found */
+ if (expect_exist)
+ printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n",
+ __func__, filename, line_num, memtype_free_str(memtype), addr);
+
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return 1;
+}
+EXPORT_SYMBOL(memtrack_is_new_addr);
+
+/* Return current page reference counter */
+int memtrack_get_page_ref_count(unsigned long addr)
+{
+ unsigned long hash_val;
+ struct memtrack_meminfo_t *cur_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+ /* This function is called only for page allocation */
+ enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC;
+ int ref_count = 0;
+
+ if (!tracked_objs_arr[memtype]) {
+ /* object is not tracked */
+ return ref_count;
+ }
+ obj_desc_p = tracked_objs_arr[memtype];
+
+ hash_val = addr % MEMTRACK_HASH_SZ;
+
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
+ /* find mem_info of given memory location */
+ cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
+ while (cur_mem_info_p != NULL) {
+ if (cur_mem_info_p->addr == addr) {
+ /* Found given address in the database - check ref-count */
+ struct page *page = (struct page *)(cur_mem_info_p->addr);
+ ref_count = atomic_read(&page->_count);
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return ref_count;
+ }
+ cur_mem_info_p = cur_mem_info_p->next;
+ }
+
+ /* not found */
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return ref_count;
+}
+EXPORT_SYMBOL(memtrack_get_page_ref_count);
/* Report current allocations status (for all memory types) */
static void memtrack_report(void)
{
- memtrack_memtype_t memtype;
+ enum memtrack_memtype_t memtype;
unsigned long cur_bucket;
- memtrack_meminfo_t *cur_mem_info_p;
+ struct memtrack_meminfo_t *cur_mem_info_p;
int serial = 1;
- tracked_obj_desc_t *obj_desc_p;
+ struct tracked_obj_desc_t *obj_desc_p;
unsigned long flags;
+ unsigned long detected_leaks = 0;
- printk("%s: Currently known allocations:\n", __func__);
+ printk(KERN_INFO "%s: Currently known allocations:\n", __func__);
for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
if (tracked_objs_arr[memtype]) {
- printk("%d) %s:\n", serial, memtype_alloc_str(memtype));
+ printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype));
obj_desc_p = tracked_objs_arr[memtype];
/* Scan all buckets to find existing allocations */
/* TBD: this may be optimized by holding a linked list of all hash items */
- for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ;
- cur_bucket++) {
+ for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) {
memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
- cur_mem_info_p =
- obj_desc_p->mem_hash[cur_bucket];
+ cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket];
while (cur_mem_info_p != NULL) { /* scan bucket */
- printk("%s::%lu: %s(%lu)==%lX\n",
+ printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n",
cur_mem_info_p->filename,
cur_mem_info_p->line_num,
memtype_alloc_str(memtype),
cur_mem_info_p->size,
- cur_mem_info_p->addr);
+ cur_mem_info_p->addr,
+ cur_mem_info_p->dev,
+ cur_mem_info_p->ext_info);
cur_mem_info_p = cur_mem_info_p->next;
+ ++detected_leaks;
} /* while cur_mem_info_p */
memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
} /* for cur_bucket */
serial++;
}
} /* for memtype */
+ printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks);
}
static struct proc_dir_entry *memtrack_tree;
-static memtrack_memtype_t get_rsc_by_name(const char *name)
+static enum memtrack_memtype_t get_rsc_by_name(const char *name)
{
- memtrack_memtype_t i;
+ enum memtrack_memtype_t i;
- for (i=0; i<MEMTRACK_NUM_OF_MEMTYPES; ++i) {
- if (strcmp(name, rsc_names[i]) == 0) {
+ for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) {
+ if (strcmp(name, rsc_names[i]) == 0)
return i;
}
- }
return i;
}
@@ -375,44 +738,41 @@ static ssize_t memtrack_read(struct file *filp,
static int file_len;
int _read, to_ret, left;
const char *fname;
- memtrack_memtype_t memtype;
+ enum memtrack_memtype_t memtype;
if (pos < 0)
return -EINVAL;
- fname= filp->f_dentry->d_name.name;
+ fname = filp->f_dentry->d_name.name;
- memtype= get_rsc_by_name(fname);
+ memtype = get_rsc_by_name(fname);
if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk("invalid file name\n");
+ printk(KERN_ERR "invalid file name\n");
return -EINVAL;
}
- if ( pos == 0 ) {
+ if (pos == 0) {
memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags);
- cur= tracked_objs_arr[memtype]->count;
+ cur = tracked_objs_arr[memtype]->count;
memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags);
_read = sprintf(kbuf, "%lu\n", cur);
- if ( _read < 0 ) {
+ if (_read < 0)
return _read;
- }
- else {
+ else
file_len = _read;
}
- }
left = file_len - pos;
to_ret = (left < size) ? left : size;
- if ( copy_to_user(buf, kbuf+pos, to_ret) ) {
+ if (copy_to_user(buf, kbuf+pos, to_ret))
return -EFAULT;
- }
else {
*offset = pos + to_ret;
return to_ret;
}
}
-static struct file_operations memtrack_proc_fops = {
+static const struct file_operations memtrack_proc_fops = {
.read = memtrack_read,
};
@@ -426,30 +786,28 @@ static int create_procfs_tree(void)
unsigned long bit_mask;
dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL);
- if ( !dir_ent ) {
+ if (!dir_ent)
return -1;
- }
memtrack_tree = dir_ent;
- for (i=0, bit_mask=1; i<MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask<<=1) {
+ for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
if (bit_mask & track_mask) {
proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree);
- if ( !proc_ent )
+ if (!proc_ent)
goto undo_create_root;
- proc_ent->proc_fops = &memtrack_proc_fops;
+ proc_ent->proc_fops = &memtrack_proc_fops;
}
}
goto exit_ok;
undo_create_root:
- for (j=0, bit_mask=1; j<i; ++j, bit_mask<<=1) {
- if (bit_mask & track_mask) {
+ for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) {
+ if (bit_mask & track_mask)
remove_proc_entry(rsc_names[j], memtrack_tree);
}
- }
remove_proc_entry(memtrack_proc_entry_name, NULL);
return -1;
@@ -463,30 +821,48 @@ static void destroy_procfs_tree(void)
int i;
unsigned long bit_mask;
- for (i=0, bit_mask=1; i<MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask<<=1) {
- if (bit_mask & track_mask) {
+ for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
+ if (bit_mask & track_mask)
remove_proc_entry(rsc_names[i], memtrack_tree);
- }
+
}
remove_proc_entry(memtrack_proc_entry_name, NULL);
}
+int memtrack_inject_error(void)
+{
+ int val = 0;
+
+ if (inject_freq) {
+ if (!(random32() % inject_freq))
+ val = 1;
+ }
+
+ return val;
+}
+EXPORT_SYMBOL(memtrack_inject_error);
+
+int memtrack_randomize_mem(void)
+{
+ return random_mem;
+}
+EXPORT_SYMBOL(memtrack_randomize_mem);
/* module entry points */
int init_module(void)
{
- memtrack_memtype_t i;
+ enum memtrack_memtype_t i;
int j;
unsigned long bit_mask;
/* create a cache for the memtrack_meminfo_t strcutures */
meminfo_cache = kmem_cache_create("memtrack_meminfo_t",
- sizeof(memtrack_meminfo_t), 0,
+ sizeof(struct memtrack_meminfo_t), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!meminfo_cache) {
- printk("memtrack::%s: failed to allocate meminfo cache\n", __func__);
+ printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__);
return -1;
}
@@ -494,49 +870,43 @@ int init_module(void)
memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr));
/* create a tracking object descriptor for all required objects */
- for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES;
- ++i, bit_mask <<= 1) {
+ for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
if (bit_mask & track_mask) {
- tracked_objs_arr[i] =
- vmalloc(sizeof(tracked_obj_desc_t));
+ tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t));
if (!tracked_objs_arr[i]) {
- printk("memtrack: failed to allocate tracking object\n");
+ printk(KERN_ERR "memtrack: failed to allocate tracking object\n");
goto undo_cache_create;
}
- memset(tracked_objs_arr[i], 0, sizeof(tracked_obj_desc_t));
+ memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t));
spin_lock_init(&tracked_objs_arr[i]->hash_lock);
INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head);
- if (bit_mask & strict_track_mask) {
+ if (bit_mask & strict_track_mask)
tracked_objs_arr[i]->strict_track = 1;
- } else {
+ else
tracked_objs_arr[i]->strict_track = 0;
}
}
- }
- if ( create_procfs_tree() ) {
- printk("%s: create_procfs_tree() failed\n", __FILE__);
+ if (create_procfs_tree()) {
+ printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__);
goto undo_cache_create;
}
-
- printk("memtrack::%s done.\n", __func__);
+ printk(KERN_INFO "memtrack::%s done.\n", __func__);
return 0;
undo_cache_create:
- for (j=0; j<i; ++j) {
- if (tracked_objs_arr[j]) {
+ for (j = 0; j < i; ++j) {
+ if (tracked_objs_arr[j])
vfree(tracked_objs_arr[j]);
}
- }
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
- if (kmem_cache_destroy(meminfo_cache) != 0) {
- printk("Failed on kmem_cache_destroy !\n");
- }
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
+ if (kmem_cache_destroy(meminfo_cache) != 0)
+ printk(KERN_ERR "Failed on kmem_cache_destroy!\n");
#else
kmem_cache_destroy(meminfo_cache);
#endif
@@ -546,10 +916,10 @@ undo_cache_create:
void cleanup_module(void)
{
- memtrack_memtype_t memtype;
+ enum memtrack_memtype_t memtype;
unsigned long cur_bucket;
- memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p;
- tracked_obj_desc_t *obj_desc_p;
+ struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p;
+ struct tracked_obj_desc_t *obj_desc_p;
unsigned long flags;
@@ -564,15 +934,12 @@ void cleanup_module(void)
/* TBD: this may be optimized by holding a linked list of all hash items */
if (tracked_objs_arr[memtype]) {
obj_desc_p = tracked_objs_arr[memtype];
- for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ;
- cur_bucket++) {
+ for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) {
memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
- cur_mem_info_p =
- obj_desc_p->mem_hash[cur_bucket];
+ cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket];
while (cur_mem_info_p != NULL) { /* scan bucket */
next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */
- kmem_cache_free(meminfo_cache,
- cur_mem_info_p);
+ kmem_cache_free(meminfo_cache, cur_mem_info_p);
cur_mem_info_p = next_mem_info_p;
} /* while cur_mem_info_p */
memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
@@ -581,20 +948,11 @@ void cleanup_module(void)
}
} /* for memtype */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
- if (kmem_cache_destroy(meminfo_cache) != 0) {
- printk
- ("memtrack::cleanup_module: Failed on kmem_cache_destroy !\n");
- }
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
+ if (kmem_cache_destroy(meminfo_cache) != 0)
+ printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n");
#else
kmem_cache_destroy(meminfo_cache);
#endif
- printk("memtrack::cleanup_module done.\n");
+ printk(KERN_INFO "memtrack::cleanup_module done.\n");
}
-
-EXPORT_SYMBOL(memtrack_alloc);
-EXPORT_SYMBOL(memtrack_free);
-
-//module_init(memtrack_init)
-//module_exit(memtrack_exit)
-
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.h b/sys/ofed/drivers/infiniband/debug/memtrack.h
index e443a31..76265ae 100644
--- a/sys/ofed/drivers/infiniband/debug/memtrack.h
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.h
@@ -22,24 +22,85 @@
#ifndef H_MEMTRACK_H
#define H_MEMTRACK_H
-typedef enum {
+enum memtrack_memtype_t {
MEMTRACK_KMALLOC,
MEMTRACK_VMALLOC,
MEMTRACK_KMEM_OBJ,
+ MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */
+ MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */
+ MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */
+ MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */
+ MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */
+ MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */
MEMTRACK_NUM_OF_MEMTYPES
-} memtrack_memtype_t;
+};
/* Invoke on memory allocation */
-void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
- unsigned long size, const char *filename,
+void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev,
+ unsigned long addr, unsigned long size, unsigned long addr2,
+ int direction, const char *filename,
const unsigned long line_num, int alloc_flags);
/* Invoke on memory free */
-void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev,
+ unsigned long addr, unsigned long size, int direction,
const char *filename, const unsigned long line_num);
+/*
+ * This function recognizes allocations which
+ * may be released by kernel (e.g. skb & vnic) and
+ * therefore not trackable by memtrack.
+ * The allocations are recognized by the name
+ * of their calling function.
+ */
+int is_non_trackable_alloc_func(const char *func_name);
+/*
+ * In some cases we need to free memory
+ * we defined as "non trackable" (see
+ * is_non_trackable_alloc_func).
+ * This function recognizes such releases
+ * by the name of their calling function.
+ */
+int is_non_trackable_free_func(const char *func_name);
+
+/* WA - This function checks whether the calling
+   function name is
+   '__ib_umem_release' or 'ib_umem_get'.
+   In that case we won't track the
+   memory there, because the kernel
+   was the one who allocated it.
+   Return value:
+   1 - if the function name matches, else 0 */
+int is_umem_put_page(const char *func_name);
+
+/* Check page order size
+ When freeing a page allocation, it checks whether
+ we are trying to free the same number of pages
+ we asked to allocate (in log2(order)).
+ If an error is found it will print
+ an error msg */
+int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr,
+ unsigned long size, const char *filename,
+ const unsigned long line_num);
+
+/* Search the current data-base to see whether a
+ specific addr exists in it.
+ If not, it will print an error msg.
+ Return value: 0 - if addr exists, else 1 */
+int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist,
+ const char *filename, const unsigned long line_num);
+
+/* Return current page reference counter */
+int memtrack_get_page_ref_count(unsigned long addr);
+
/* Report current allocations status (for all memory types) */
/* we do not export this function since it is used by cleanup_module only */
/* void memtrack_report(void); */
+/* Allow support of error injections */
+int memtrack_inject_error(void);
+
+/* randomize allocated memory */
+int memtrack_randomize_mem(void);
+
#endif
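The memtrack_alloc()/memtrack_free() prototypes above now carry a device handle, a secondary address and a DMA direction so that ib_dma_* mappings can be tracked alongside plain allocations. Below is a hedged sketch of how a mapping wrapper could feed this interface; the wrapper names are invented for illustration, and casting the ib_device pointer to the dev argument is an assumption, not something mandated by the header:

	/* Assumes <rdma/ib_verbs.h> and "memtrack.h" are included. */
	static u64 my_dma_map_single(struct ib_device *dev, void *cpu_addr,
				     size_t size, enum dma_data_direction dir)
	{
		u64 bus = ib_dma_map_single(dev, cpu_addr, size, dir);

		if (!ib_dma_mapping_error(dev, bus))
			/* record: memtype, device, bus address, size, CPU address, direction */
			memtrack_alloc(MEMTRACK_DMA_MAP_SINGLE, (unsigned long)dev,
				       (unsigned long)bus, size,
				       (unsigned long)cpu_addr, dir,
				       __FILE__, __LINE__, GFP_ATOMIC);
		return bus;
	}

	static void my_dma_unmap_single(struct ib_device *dev, u64 bus,
					size_t size, enum dma_data_direction dir)
	{
		/* the free side must pass the same device, size and direction */
		memtrack_free(MEMTRACK_DMA_MAP_SINGLE, (unsigned long)dev,
			      (unsigned long)bus, size, dir,
			      __FILE__, __LINE__);
		ib_dma_unmap_single(dev, bus, size, dir);
	}

memtrack_free() then cross-checks the recorded direction and size for the DMA memtypes, as implemented in memtrack.c above.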
diff --git a/sys/ofed/drivers/infiniband/debug/mtrack.h b/sys/ofed/drivers/infiniband/debug/mtrack.h
index 337d9c3..5c0cd20 100644
--- a/sys/ofed/drivers/infiniband/debug/mtrack.h
+++ b/sys/ofed/drivers/infiniband/debug/mtrack.h
@@ -1,46 +1,84 @@
#ifndef __mtrack_h_
#define __mtrack_h_
-#include <memtrack.h>
+#include "memtrack.h"
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/io.h> /* For ioremap_nocache, ioremap, iounmap */
+#include <linux/random.h>
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 27)
+# include <linux/io-mapping.h> /* For ioremap_nocache, ioremap, iounmap */
+#endif
+#include <linux/mm.h> /* For all page handling */
+#include <linux/workqueue.h> /* For all work-queue handling */
+#include <linux/scatterlist.h> /* For using scatterlists */
+#include <linux/skbuff.h> /* For skbufs handling */
+#include <asm/uaccess.h> /* For copy from/to user */
+
+#define MEMTRACK_ERROR_INJECTION_MESSAGE(file, line, func) ({ \
+ printk(KERN_ERR "%s failure injected at %s:%d\n", func, file, line); \
+ dump_stack(); \
+})
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14)
#define RDMA_KZALLOC_H
#define kzalloc(size, flags) ({ \
- void *__memtrack_kz_addr; \
+ void *__memtrack_kz_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\
+ else \
__memtrack_kz_addr = kmalloc(size, flags); \
- if ( __memtrack_kz_addr ) { \
- memset( __memtrack_kz_addr, 0, size) ; \
+ if (__memtrack_kz_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memset(__memtrack_kz_addr, 0, size); \
} \
__memtrack_kz_addr; \
})
#else
#define kzalloc(size, flags) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\
+ else \
__memtrack_addr = kzalloc(size, flags); \
- if ( __memtrack_addr && (size)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, flags); \
+ if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \
} \
__memtrack_addr; \
})
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#define kzalloc_node(size, flags, node) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc_node"); \
+ else \
+ __memtrack_addr = kzalloc_node(size, flags, node); \
+ if (__memtrack_addr && (size) && \
+ !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \
+ } \
+ __memtrack_addr; \
+})
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
#define kcalloc(n, size, flags) kzalloc((n)*(size), flags)
#else
#define kcalloc(n, size, flags) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kcalloc");\
+ else \
__memtrack_addr = kcalloc(n, size, flags); \
- if ( __memtrack_addr && (size)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), (n)*(size), __FILE__, __LINE__, flags); \
+ if (__memtrack_addr && (size)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \
} \
__memtrack_addr; \
})
@@ -50,76 +88,208 @@
#ifdef ZERO_OR_NULL_PTR
#define kmalloc(sz, flgs) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\
+ else \
__memtrack_addr = kmalloc(sz, flgs); \
- if ( !ZERO_OR_NULL_PTR(__memtrack_addr)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+ if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
+ if (memtrack_randomize_mem()) \
+ get_random_bytes(__memtrack_addr, sz); \
} \
__memtrack_addr; \
})
#else
#define kmalloc(sz, flgs) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\
+ else \
__memtrack_addr = kmalloc(sz, flgs); \
- if ( __memtrack_addr ) { \
- memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
+ if (memtrack_randomize_mem()) \
+ get_random_bytes(__memtrack_addr, sz); \
} \
__memtrack_addr; \
})
#endif
+#define kmalloc_node(sz, flgs, node) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc_node"); \
+ else \
+ __memtrack_addr = kmalloc_node(sz, flgs, node); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
+ if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \
+ get_random_bytes(__memtrack_addr, sz); \
+ } \
+ __memtrack_addr; \
+})
+
+#ifdef ZERO_OR_NULL_PTR
+#define kmemdup(src, sz, flgs) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\
+ else \
+ __memtrack_addr = kmemdup(src, sz, flgs); \
+ if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
+ } \
+ __memtrack_addr; \
+})
+#else
+#define kmemdup(src, sz, flgs) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\
+ else \
+ __memtrack_addr = kmemdup(src, sz, flgs); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
+ } \
+ __memtrack_addr; \
+})
+#endif
+
#ifdef ZERO_OR_NULL_PTR
#define kfree(addr) ({ \
void *__memtrack_addr = (void *)addr; \
- if ( !ZERO_OR_NULL_PTR(__memtrack_addr) ) { \
- memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ \
+ if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \
+ !is_non_trackable_free_func(__func__)) { \
+ memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
} \
kfree(__memtrack_addr); \
})
#else
#define kfree(addr) ({ \
void *__memtrack_addr = (void *)addr; \
- if ( __memtrack_addr ) { \
- memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ \
+ if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \
+ memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
} \
kfree(__memtrack_addr); \
})
#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined (CONFIG_COMPAT_RCU)
+#ifdef kfree_rcu
+ #undef kfree_rcu
+#endif
-
-
-
+#ifdef ZERO_OR_NULL_PTR
+#define kfree_rcu(addr, rcu_head) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \
+ !is_non_trackable_free_func(__func__)) { \
+ memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \
+})
+#else
+#define kfree_rcu(addr, rcu_head) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \
+ memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \
+})
+#endif
+#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined(CONFIG_COMPAT_RCU) */
#define vmalloc(size) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc");\
+ else \
__memtrack_addr = vmalloc(size); \
- if ( __memtrack_addr ) { \
- memtrack_alloc(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, GFP_ATOMIC); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ if (memtrack_randomize_mem()) \
+ get_random_bytes(__memtrack_addr, size); \
+ } \
+ __memtrack_addr; \
+})
+
+#ifndef vzalloc
+#define vzalloc(size) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc");\
+ else \
+ __memtrack_addr = vzalloc(size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
} \
__memtrack_addr; \
})
+#endif
+
+#ifndef vzalloc_node
+#define vzalloc_node(size, node) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc_node"); \
+ else \
+ __memtrack_addr = vzalloc_node(size, node); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ if (memtrack_randomize_mem()) \
+ get_random_bytes(__memtrack_addr, size); \
+ } \
+ __memtrack_addr; \
+})
+#endif
+#define vmalloc_node(size, node) ({ \
+ void *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc_node"); \
+ else \
+ __memtrack_addr = vmalloc_node(size, node); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ if (memtrack_randomize_mem()) \
+ get_random_bytes(__memtrack_addr, size); \
+ } \
+ __memtrack_addr; \
+})
#define vfree(addr) ({ \
void *__memtrack_addr = (void *)addr; \
- if ( __memtrack_addr ) { \
- memtrack_free(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ if (__memtrack_addr) { \
+ memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
} \
vfree(__memtrack_addr); \
})
#define kmem_cache_alloc(cache, flags) ({ \
- void *__memtrack_addr; \
+ void *__memtrack_addr = NULL; \
\
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmem_cache_alloc"); \
+ else \
__memtrack_addr = kmem_cache_alloc(cache, flags); \
- if ( __memtrack_addr ) { \
- memtrack_alloc(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__, flags); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \
} \
__memtrack_addr; \
})
@@ -127,12 +297,548 @@
#define kmem_cache_free(cache, addr) ({ \
void *__memtrack_addr = (void *)addr; \
- if ( __memtrack_addr ) { \
- memtrack_free(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ \
+ if (__memtrack_addr) { \
+ memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
} \
kmem_cache_free(cache, __memtrack_addr); \
})
+/* All IO-MAP handling */
+#define ioremap(phys_addr, size) ({ \
+ void __iomem *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap");\
+ else \
+ __memtrack_addr = ioremap(phys_addr, size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+
+#define io_mapping_create_wc(base, size) ({ \
+ void __iomem *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "io_mapping_create_wc"); \
+ else \
+ __memtrack_addr = io_mapping_create_wc(base, size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+
+#define io_mapping_free(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr) { \
+ memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ io_mapping_free(__memtrack_addr); \
+})
+
+#ifdef CONFIG_PPC
+#ifdef ioremap_nocache
+ #undef ioremap_nocache
+#endif
+#define ioremap_nocache(phys_addr, size) ({ \
+ void __iomem *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
+ else \
+ __memtrack_addr = ioremap(phys_addr, size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18) /* 2.6.16 - 2.6.17 */
+#ifdef ioremap_nocache
+ #undef ioremap_nocache
+#endif
+#define ioremap_nocache(phys_addr, size) ({ \
+ void __iomem *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
+ else \
+ __memtrack_addr = ioremap(phys_addr, size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+#else
+#define ioremap_nocache(phys_addr, size) ({ \
+ void __iomem *__memtrack_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
+ else \
+ __memtrack_addr = ioremap_nocache(phys_addr, size); \
+ if (__memtrack_addr) { \
+ memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+#endif /* Kernel version is under 2.6.18 */
+#endif /* PPC */
+
+#define iounmap(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr) { \
+ memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ iounmap(__memtrack_addr); \
+})
+
+
+/* All Page handlers */
+/* TODO: Catch netif_rx for page dereference */
+#define alloc_pages_node(nid, gfp_mask, order) ({ \
+ struct page *page_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages_node"); \
+ else \
+ page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \
+ if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ page_addr; \
+})
+
+#ifdef CONFIG_NUMA
+#define alloc_pages(gfp_mask, order) ({ \
+ struct page *page_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages"); \
+ else \
+ page_addr = (struct page *)alloc_pages(gfp_mask, order); \
+ if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ page_addr; \
+})
+#else
+#ifdef alloc_pages
+ #undef alloc_pages
+#endif
+#define alloc_pages(gfp_mask, order) ({ \
+ struct page *page_addr; \
+ \
+ page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \
+ page_addr; \
+})
+#endif
+
+#define __get_free_pages(gfp_mask, order) ({ \
+ struct page *page_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "__get_free_pages"); \
+ else \
+ page_addr = (struct page *)__get_free_pages(gfp_mask, order); \
+ if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ page_addr; \
+})
+
+#define get_zeroed_page(gfp_mask) ({ \
+ struct page *page_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_zeroed_page"); \
+ else \
+ page_addr = (struct page *)get_zeroed_page(gfp_mask); \
+ if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ (unsigned long)page_addr; \
+})
+
+#define __free_pages(addr, order) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
+ if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \
+ memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ __free_pages(addr, order); \
+})
+
+
+#define free_pages(addr, order) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
+ if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \
+ memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ free_pages(addr, order); \
+})
+
+
+#define get_page(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
+ if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ } \
+ get_page(addr); \
+})
+
+#define get_user_pages_fast(start, nr_pages, write, pages) ({ \
+ int __memtrack_rc = -1; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_user_pages_fast"); \
+ else \
+ __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \
+ if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \
+ int __memtrack_i; \
+ \
+ for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \
+ memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_rc; \
+})
+
+#define put_page(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ \
+ if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
+		/* If this is not part of a umem put page, the addr is */\
+		/* not new, and the ref-count is 1, then free this addr. */\
+		/* Don't change the order of these conditions */ \
+ if (!is_umem_put_page(__func__) && \
+ !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \
+ (memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \
+ memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ } \
+ put_page(addr); \
+})
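The three-part condition above is order-sensitive. A hedged restatement as a standalone helper (assumes memtrack.h is included; the helper is illustrative and not part of the patch):

static int should_untrack_page(const char *caller, unsigned long addr,
			       const char *file, unsigned long line)
{
	/* Drop the tracking record only when this is not a umem put,
	 * the address is already known to the database, and ours is
	 * the last reference.  Keep the short-circuit order. */
	return !is_umem_put_page(caller) &&
	       !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, addr, 1, file, line) &&
	       memtrack_get_page_ref_count(addr) == 1;
}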
+
+
+/* Work-Queue handlers */
+#ifdef create_workqueue
+ #undef create_workqueue
+#endif
+#ifdef create_rt_workqueue
+ #undef create_rt_workqueue
+#endif
+#ifdef create_freezeable_workqueue
+ #undef create_freezeable_workqueue
+#endif
+#ifdef create_singlethread_workqueue
+ #undef create_singlethread_workqueue
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) /* 2.6.18 - 2.6.19 */
+#define create_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#define create_singlethread_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 1); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) /* 2.6.20 - 2.6.27 */
+#define create_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 0, 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 22) /* 2.6.20 - 2.6.21 */
+#define create_freezeable_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 0, 1); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+#else /* 2.6.22 - 2.6.27 */
+#define create_freezeable_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 1, 1); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+#endif /* 2.6.20 - 2.6.27 */
+
+#define create_singlethread_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 1, 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) /* 2.6.28 - 2.6.35 */
+
+#ifdef alloc_workqueue
+ #undef alloc_workqueue
+#endif
+
+#define alloc_workqueue(name, flags, max_active) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), (flags), (max_active), 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#define create_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 0, 0, 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#define create_rt_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_rt_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 0, 0, 1); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#define create_freezeable_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 1, 1, 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+
+#define create_singlethread_workqueue(name) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
+ else \
+ wq_addr = __create_workqueue((name), 1, 0, 0); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+#else /* 2.6.36 */
+#ifdef alloc_workqueue
+ #undef alloc_workqueue
+#endif
+#ifdef CONFIG_LOCKDEP
+#define alloc_workqueue(name, flags, max_active) \
+({ \
+ static struct lock_class_key __key; \
+ const char *__lock_name; \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (__builtin_constant_p(name)) \
+ __lock_name = (name); \
+ else \
+ __lock_name = #name; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
+ else \
+ wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \
+ &__key, __lock_name); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+#else
+#define alloc_workqueue(name, flags, max_active) ({ \
+ struct workqueue_struct *wq_addr = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
+ else \
+ wq_addr = __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL); \
+ if (wq_addr) { \
+ memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ wq_addr; \
+})
+#endif
+
+#define create_workqueue(name) \
+ alloc_workqueue((name), WQ_RESCUER, 1);
+
+#define create_freezeable_workqueue(name) \
+ alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1);
+
+#define create_singlethread_workqueue(name) \
+ alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1);
+
+#endif /* Work-Queue Kernel Versions */
+
+#define destroy_workqueue(wq_addr) ({ \
+ void *__memtrack_addr = (void *)wq_addr; \
+ \
+ if (__memtrack_addr) { \
+ memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
+ } \
+ destroy_workqueue(wq_addr); \
+})
+
+/* ONLY error injection to functions that we don't monitor */
+#define alloc_skb(size, prio) ({ \
+ struct sk_buff *__memtrack_skb = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb"); \
+ else \
+ __memtrack_skb = alloc_skb(size, prio); \
+ __memtrack_skb; \
+})
+
+#define dev_alloc_skb(size) ({ \
+ struct sk_buff *__memtrack_skb = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "dev_alloc_skb"); \
+ else \
+ __memtrack_skb = dev_alloc_skb(size); \
+ __memtrack_skb; \
+})
+
+#define alloc_skb_fclone(size, prio) ({ \
+ struct sk_buff *__memtrack_skb = NULL; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb_fclone"); \
+ else \
+ __memtrack_skb = alloc_skb_fclone(size, prio); \
+ __memtrack_skb; \
+})
+
+#define copy_from_user(to, from, n) ({ \
+ int ret = n; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_from_user"); \
+ else \
+ ret = copy_from_user(to, from, n); \
+ ret; \
+})
+
+#define copy_to_user(to, from, n) ({ \
+ int ret = n; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_to_user"); \
+ else \
+ ret = copy_to_user(to, from, n); \
+ ret; \
+})
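When injection fires, the wrapper leaves ret equal to n, i.e. "n bytes not copied". A hedged sketch of how a typical caller then reacts (function and variable names are hypothetical):

static long read_stats(void __user *ubuf, const void *kbuf, size_t len)
{
	/* A non-zero return from copy_to_user() means not every byte was
	 * copied, so the injected ret = n surfaces here as -EFAULT. */
	if (copy_to_user(ubuf, kbuf, len))
		return -EFAULT;
	return len;
}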
+
+#define sysfs_create_file(kobj, attr) ({ \
+ int ret = -ENOSYS; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_file"); \
+ else \
+ ret = sysfs_create_file(kobj, attr); \
+ ret; \
+})
+
+#define sysfs_create_link(kobj, target, name) ({ \
+ int ret = -ENOSYS; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_link"); \
+ else \
+ ret = sysfs_create_link(kobj, target, name); \
+ ret; \
+})
+
+#define sysfs_create_group(kobj, grp) ({ \
+ int ret = -ENOSYS; \
+ \
+ if (memtrack_inject_error()) \
+ MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_group"); \
+ else \
+ ret = sysfs_create_group(kobj, grp); \
+ ret; \
+})
+
#endif /* __mtrack_h_ */
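Usage of these wrappers is transparent: a module that includes mtrack.h after the regular slab/vmalloc headers keeps its call sites unchanged while every allocation and free is recorded. A hedged sketch (illustrative module code, not part of this patch):

#include "mtrack.h"

static int example_init(void)
{
	void *buf = kmalloc(128, GFP_KERNEL);	/* wrapped: memtrack_alloc() records it */

	if (!buf)
		return -ENOMEM;
	kfree(buf);				/* wrapped: matching memtrack_free() */
	return 0;
}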
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
deleted file mode 100644
index 7b81da0..0000000
--- a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-# $FreeBSD$
-#.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4
-#.PATH: ${.CURDIR}/../../../../include/linux
-
-.include <src.opts.mk>
-
-KMOD = mlx4ib
-SRCS = device_if.h bus_if.h pci_if.h vnode_if.h
-#SRCS+= linux_compat.c linux_radix.c
-SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c
-SRCS+= opt_inet.h opt_inet6.h
-
-#CFLAGS+= -I${.CURDIR}/../../ofed/include/
-CFLAGS+= -I${.CURDIR}/../../../../include
-CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM
-
-.if !defined(KERNBUILDDIR)
-.if ${MK_INET_SUPPORT} != "no"
-opt_inet.h:
- @echo "#define INET 1" > ${.TARGET}
-.endif
-
-.if ${MK_INET6_SUPPORT} != "no"
-opt_inet6.h:
- @echo "#define INET6 1" > ${.TARGET}
-.endif
-.endif
-
-.include <bsd.kmod.mk>
-
-CFLAGS+= -Wno-cast-qual -Wno-pointer-arith ${GCC_MS_EXTENSIONS}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
index fe35e62..1c30fa9 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -95,21 +94,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
{
struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
struct mlx4_dev *dev = ibdev->dev;
- union ib_gid sgid;
- u8 mac[6];
- int err;
- int is_mcast;
+ int is_mcast = 0;
+ struct in6_addr in6;
u16 vlan_tag;
- err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num);
- if (err)
- return ERR_PTR(err);
-
- memcpy(ah->av.eth.mac, mac, 6);
- err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid);
- if (err)
- return ERR_PTR(err);
- vlan_tag = rdma_get_vlan_id(&sgid);
+ memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
+ if (rdma_is_multicast_addr(&in6)) {
+ is_mcast = 1;
+ resolve_mcast_mac(&in6, ah->av.eth.mac);
+ } else {
+ memcpy(ah->av.eth.mac, ah_attr->dmac, 6);
+ }
+ vlan_tag = ah_attr->vlan_id;
if (vlan_tag < 0x1000)
vlan_tag |= (ah_attr->sl & 7) << 13;
ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
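The vlan_tag handling above packs the service level into the 802.1Q priority bits of the tag: a 12-bit VID in bits 0-11 and a 3-bit PCP in bits 13-15. A hedged standalone illustration of the same bit layout (not driver code):

#include <stdint.h>
#include <stdio.h>

static uint16_t pack_vlan_tci(uint16_t vlan_id, uint8_t sl)
{
	uint16_t tci = vlan_id;

	if (tci < 0x1000)				/* a valid 12-bit VID is present */
		tci |= (uint16_t)(sl & 7) << 13;	/* service level -> PCP bits */
	return tci;
}

int main(void)
{
	printf("0x%04x\n", pack_vlan_tci(100, 3));	/* prints 0x6064 */
	return 0;
}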
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
index 0738adc..17e646a 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -57,6 +57,7 @@ struct mlx4_alias_guid_work_context {
int query_id;
struct list_head list;
int block_num;
+ u8 method;
};
struct mlx4_next_alias_guid_work {
@@ -80,7 +81,8 @@ void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
ports_guid[port_num - 1].
all_rec_per_port[block_num].guid_indexes);
- pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num,
+ (unsigned long long)guid_indexes);
for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
/* The location of the specific index starts from bit number 4
@@ -144,7 +146,8 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
ports_guid[port_num - 1].
all_rec_per_port[block_num].guid_indexes);
- pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num,
+ (unsigned long long)guid_indexes);
/*calculate the slaves and notify them*/
for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
@@ -201,7 +204,7 @@ static void aliasguid_query_handler(int status,
{
struct mlx4_ib_dev *dev;
struct mlx4_alias_guid_work_context *cb_ctx = context;
- u8 port_index ;
+ u8 port_index;
int i;
struct mlx4_sriov_alias_guid_info_rec_det *rec;
unsigned long flags, flags1;
@@ -240,6 +243,18 @@ static void aliasguid_query_handler(int status,
for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
__be64 tmp_cur_ag;
tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
+ if ((cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE)
+ && (MLX4_NOT_SET_GUID == tmp_cur_ag)) {
+ pr_debug("%s:Record num %d in block_num:%d "
+ "was deleted by SM,ownership by %d "
+ "(0 = driver, 1=sysAdmin, 2=None)\n",
+ __func__, i, guid_rec->block_num,
+ rec->ownership);
+ rec->guid_indexes = rec->guid_indexes &
+ ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ continue;
+ }
+
/* check if the SM didn't assign one of the records.
* if it didn't, if it was not sysadmin request:
* ask the SM to give a new GUID, (instead of the driver request).
@@ -379,7 +394,7 @@ static int set_guid_rec(struct ib_device *ibdev,
callback_context->port = port;
callback_context->dev = dev;
callback_context->block_num = index;
-
+ callback_context->method = rec_det->method;
memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
guid_info_rec.lid = cpu_to_be16(attr.lid);
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cm.c b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
index 1bfbeee..3ff7600 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c
@@ -33,6 +33,7 @@
#include <rdma/ib_mad.h>
#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
#include <linux/idr.h>
#include <rdma/ib_cm.h>
@@ -60,6 +61,11 @@ struct cm_generic_msg {
__be32 remote_comm_id;
};
+struct cm_sidr_generic_msg {
+ struct ib_mad_hdr hdr;
+ __be32 request_id;
+};
+
struct cm_req_msg {
unsigned char unused[0x60];
union ib_gid primary_path_sgid;
@@ -68,28 +74,62 @@ struct cm_req_msg {
static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ pr_err("trying to set local_comm_id in SIDR_REP\n");
+ return;
+ } else {
struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
msg->local_comm_id = cpu_to_be32(cm_id);
+ }
}
static u32 get_local_comm_id(struct ib_mad *mad)
{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		pr_err("trying to get local_comm_id in SIDR_REP\n");
+ return -1;
+ } else {
struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
-
return be32_to_cpu(msg->local_comm_id);
+ }
}
static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+ return;
+ } else {
struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
msg->remote_comm_id = cpu_to_be32(cm_id);
+ }
}
static u32 get_remote_comm_id(struct ib_mad *mad)
{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		pr_err("trying to get remote_comm_id in SIDR_REQ\n");
+ return -1;
+ } else {
struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
-
return be32_to_cpu(msg->remote_comm_id);
+ }
}
static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
@@ -285,19 +325,22 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
u32 sl_cm_id;
int pv_cm_id = -1;
- sl_cm_id = get_local_comm_id(mad);
-
if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
- mad->mad_hdr.attr_id == CM_REP_ATTR_ID) {
+ mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ sl_cm_id = get_local_comm_id(mad);
id = id_map_alloc(ibdev, slave_id, sl_cm_id);
if (IS_ERR(id)) {
mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
__func__, slave_id, sl_cm_id);
return PTR_ERR(id);
}
- } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) {
+ } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
return 0;
} else {
+ sl_cm_id = get_local_comm_id(mad);
id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
}
@@ -323,7 +366,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
u32 pv_cm_id;
struct id_map_entry *id;
- if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) {
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
union ib_gid gid;
if (is_eth)
@@ -333,7 +377,7 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
*slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
if (*slave < 0) {
mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
- (long long)gid.global.interface_id);
+ (unsigned long long)gid.global.interface_id);
return -ENOENT;
}
return 0;
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
index 293917a..52788c2 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
@@ -33,6 +33,7 @@
#include <linux/mlx4/cq.h>
#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
#include <linux/slab.h>
#include "mlx4_ib.h"
@@ -92,12 +93,33 @@ static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
return get_sw_cqe(cq, cq->mcq.cons_index);
}
-int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+int mlx4_ib_modify_cq(struct ib_cq *cq,
+ struct ib_cq_attr *cq_attr,
+ int cq_attr_mask)
{
+ int err = 0;
struct mlx4_ib_cq *mcq = to_mcq(cq);
struct mlx4_ib_dev *dev = to_mdev(cq->device);
- return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+ if (cq_attr_mask & IB_CQ_CAP_FLAGS) {
+ if (cq_attr->cq_cap_flags & IB_CQ_TIMESTAMP)
+ return -ENOTSUPP;
+
+ if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) {
+ if (dev->dev->caps.cq_flags & MLX4_DEV_CAP_CQ_FLAG_IO)
+ err = mlx4_cq_ignore_overrun(dev->dev, &mcq->mcq);
+ else
+ err = -ENOSYS;
+ }
+ }
+
+ if (!err)
+ if (cq_attr_mask & IB_CQ_MODERATION)
+ err = mlx4_cq_modify(dev->dev, &mcq->mcq,
+ cq_attr->moderation.cq_count,
+ cq_attr->moderation.cq_period);
+
+ return err;
}
static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
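A hedged sketch of driving the extended modify-CQ entry point above with a moderation request; the attribute fields and the IB_CQ_MODERATION mask are taken from the hunk, while the helper itself is illustrative:

static int set_cq_moderation(struct ib_cq *cq, u16 count, u16 period)
{
	struct ib_cq_attr attr = {};

	attr.moderation.cq_count  = count;	/* completions per event */
	attr.moderation.cq_period = period;	/* max delay before an event */
	return mlx4_ib_modify_cq(cq, &attr, IB_CQ_MODERATION);
}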
@@ -173,7 +195,11 @@ err_buf:
return err;
}
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+/* we don't support system timestamping */
+#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_TIMESTAMP
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
+ struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
struct ib_udata *udata)
{
@@ -181,11 +207,16 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
struct mlx4_ib_cq *cq;
struct mlx4_uar *uar;
int err;
+ int entries = attr->cqe;
+ int vector = attr->comp_vector;
if (entries < 1 || entries > dev->dev->caps.max_cqes)
return ERR_PTR(-EINVAL);
- cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
+ return ERR_PTR(-EINVAL);
+
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
if (!cq)
return ERR_PTR(-ENOMEM);
@@ -195,6 +226,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
+ cq->create_flags = attr->flags;
if (context) {
struct mlx4_ib_create_cq ucmd;
@@ -236,7 +268,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
vector = dev->eq_table[vector % ibdev->num_comp_vectors];
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
- cq->db.dma, &cq->mcq, vector, 0, 0);
+ cq->db.dma, &cq->mcq, vector, 0,
+ !!(cq->create_flags & IB_CQ_TIMESTAMP));
if (err)
goto err_dbmap;
@@ -331,21 +364,23 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
u32 i;
i = cq->mcq.cons_index;
- while (get_sw_cqe(cq, i & cq->ibcq.cqe))
+ while (get_sw_cqe(cq, i))
++i;
return i - cq->mcq.cons_index;
}
-static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+static int mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
{
struct mlx4_cqe *cqe, *new_cqe;
int i;
int cqe_size = cq->buf.entry_size;
int cqe_inc = cqe_size == 64 ? 1 : 0;
+ struct mlx4_cqe *start_cqe;
i = cq->mcq.cons_index;
cqe = get_cqe(cq, i & cq->ibcq.cqe);
+ start_cqe = cqe;
cqe += cqe_inc;
while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
@@ -357,9 +392,15 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+ if (cqe == start_cqe) {
+ pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn);
+ return -ENOMEM;
+ }
cqe += cqe_inc;
+
}
++cq->mcq.cons_index;
+ return 0;
}
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
@@ -374,7 +415,6 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
return -ENOSYS;
mutex_lock(&cq->resize_mutex);
-
if (entries < 1 || entries > dev->dev->caps.max_cqes) {
err = -EINVAL;
goto out;
@@ -386,6 +426,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
goto out;
}
+ if (entries > dev->dev->caps.max_cqes + 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (ibcq->uobject) {
err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
if (err)
@@ -425,7 +470,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
spin_lock_irq(&cq->lock);
if (cq->resize_buf) {
- mlx4_ib_cq_resize_copy_cqes(cq);
+ err = mlx4_ib_cq_resize_copy_cqes(cq);
tmp_buf = cq->buf;
tmp_cqe = cq->ibcq.cqe;
cq->buf = cq->resize_buf->buf;
@@ -580,7 +625,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
}
static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
- unsigned tail, struct mlx4_cqe *cqe)
+ unsigned tail, struct mlx4_cqe *cqe, int is_eth)
{
struct mlx4_ib_proxy_sqp_hdr *hdr;
@@ -590,12 +635,19 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct
DMA_FROM_DEVICE);
hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index);
- wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
- wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
wc->dlid_path_bits = 0;
+ if (is_eth) {
+ wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid);
+ memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4);
+ memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2);
+ } else {
+ wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
+ wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+ }
+
return 0;
}
@@ -607,11 +659,14 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
struct mlx4_qp *mqp;
struct mlx4_ib_wq *wq;
struct mlx4_ib_srq *srq;
+ struct mlx4_srq *msrq = NULL;
int is_send;
int is_error;
u32 g_mlpath_rqpn;
u16 wqe_ctr;
unsigned tail = 0;
+ int timestamp_en = !!(cq->create_flags & IB_CQ_TIMESTAMP);
+
repoll:
cqe = next_cqe_sw(cq);
@@ -675,6 +730,20 @@ repoll:
wc->qp = &(*cur_qp)->ibqp;
+ if (wc->qp->qp_type == IB_QPT_XRC_TGT) {
+ u32 srq_num;
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ srq_num = g_mlpath_rqpn & 0xffffff;
+ /* SRQ is also in the radix tree */
+ msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+ srq_num);
+ if (unlikely(!msrq)) {
+ pr_warn("CQ %06x with entry for unknown SRQN %06x\n",
+ cq->mcq.cqn, srq_num);
+ return -EINVAL;
+ }
+ }
+
if (is_send) {
wq = &(*cur_qp)->sq;
if (!(*cur_qp)->sq_signal_bits) {
@@ -688,6 +757,11 @@ repoll:
wqe_ctr = be16_to_cpu(cqe->wqe_index);
wc->wr_id = srq->wrid[wqe_ctr];
mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else if (msrq) {
+ srq = to_mibsrq(msrq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
} else {
wq = &(*cur_qp)->rq;
tail = wq->tail & (wq->wqe_cnt - 1);
@@ -707,6 +781,7 @@ repoll:
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
case MLX4_OPCODE_RDMA_WRITE_IMM:
wc->wc_flags |= IB_WC_WITH_IMM;
+ /* fall through */
case MLX4_OPCODE_RDMA_WRITE:
wc->opcode = IB_WC_RDMA_WRITE;
break;
@@ -778,10 +853,31 @@ repoll:
if ((*cur_qp)->mlx4_ib_qp_type &
(MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
- return use_tunnel_data(*cur_qp, cq, wc, tail, cqe);
+ return use_tunnel_data
+ (*cur_qp, cq, wc, tail, cqe,
+ rdma_port_get_link_layer
+ (wc->qp->device,
+ (*cur_qp)->port) ==
+ IB_LINK_LAYER_ETHERNET);
}
+ if (timestamp_en) {
+ /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is
+ * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't
+ * supported */
+ if (cq->create_flags & IB_CQ_TIMESTAMP_TO_SYS_TIME) {
+ wc->ts.timestamp = 0;
+ } else {
+ wc->ts.timestamp =
+ ((u64)(be32_to_cpu(cqe->timestamp_16_47)
+ + !cqe->timestamp_0_15) << 16)
+ | be16_to_cpu(cqe->timestamp_0_15);
+ wc->wc_flags |= IB_WC_WITH_TIMESTAMP;
+ }
+ } else {
+ wc->wc_flags |= IB_WC_WITH_SLID;
wc->slid = be16_to_cpu(cqe->rlid);
+ }
g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xffffff;
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
@@ -789,11 +885,27 @@ repoll:
wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status,
cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
+ if (!timestamp_en) {
if (rdma_port_get_link_layer(wc->qp->device,
- (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET)
+ (*cur_qp)->port) ==
+ IB_LINK_LAYER_ETHERNET)
wc->sl = be16_to_cpu(cqe->sl_vid) >> 13;
else
wc->sl = be16_to_cpu(cqe->sl_vid) >> 12;
+ wc->wc_flags |= IB_WC_WITH_SL;
+ }
+ if ((be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_VLAN_PRESENT_MASK) && !timestamp_en) {
+ wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
+ MLX4_CQE_VID_MASK;
+ wc->wc_flags |= IB_WC_WITH_VLAN;
+ } else {
+ wc->vlan_id = 0xffff;
+ }
+ if (!timestamp_en) {
+ memcpy(wc->smac, cqe->smac, 6);
+ wc->wc_flags |= IB_WC_WITH_SMAC;
+ }
}
return 0;
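The timestamp branch above splices a 48-bit completion timestamp out of two CQE fields. A hedged standalone illustration of that reconstruction (byte-swapping omitted, inputs assumed already in host order; the "+ !low" term mirrors the driver's adjustment when the low half reads zero):

#include <stdint.h>
#include <stdio.h>

static uint64_t rebuild_cq_timestamp(uint32_t ts_16_47, uint16_t ts_0_15)
{
	/* The high field carries timestamp bits 16..47; the low field
	 * carries bits 0..15.  Splice them into one 48-bit value. */
	return ((uint64_t)(ts_16_47 + !ts_0_15) << 16) | ts_0_15;
}

int main(void)
{
	printf("0x%llx\n",
	       (unsigned long long)rebuild_cq_timestamp(0x00123456, 0x789a));
	return 0;
}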
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
index 8aee423..c517409 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
@@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
struct mlx4_db *db)
{
struct mlx4_ib_user_db_page *page;
- struct ib_umem_chunk *chunk;
int err = 0;
mutex_lock(&context->db_page_mutex);
@@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
list_add(&page->list, &context->db_page_list);
found:
- chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
- db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+ db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
db->u.user_page = page;
++page->refcnt;
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
index 74bbf5c..bd36931 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
@@ -545,11 +545,32 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
/* adjust tunnel data */
tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
- tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
- tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+ if (is_eth) {
+ u16 vlan = 0;
+ if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan,
+ NULL)) {
+ if (vlan != wc->vlan_id)
+				/* VST and the default vlan does not match the
+				 * packet vlan: drop the packet */
+ goto out;
+ else
+				/* VST: remove/hide the vlan from the VF */
+ vlan = 0;
+ } else {
+ vlan = wc->vlan_id;
+ }
+
+ tun_mad->hdr.sl_vid = cpu_to_be16(vlan);
+ memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4);
+ memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2);
+ } else {
+ tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+ tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
+ }
+
ib_dma_sync_single_for_device(&dev->ib_dev,
tun_qp->tx_ring[tun_tx_ix].buf.map,
sizeof (struct mlx4_rcv_tunnel_mad),
@@ -696,12 +717,11 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
be16_to_cpu(in_mad->mad_hdr.attr_id));
if (in_wc->wc_flags & IB_WC_GRH) {
pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
- (long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix),
- (long long)
- be64_to_cpu(in_grh->sgid.global.interface_id));
+ (unsigned long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix),
+ (unsigned long long)be64_to_cpu(in_grh->sgid.global.interface_id));
pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
- (long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix),
- (long long)be64_to_cpu(in_grh->dgid.global.interface_id));
+ (unsigned long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix),
+ (unsigned long long)be64_to_cpu(in_grh->dgid.global.interface_id));
}
}
@@ -946,7 +966,7 @@ int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index,
err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
- MLX4_CMD_WRAPPED);
+ MLX4_CMD_NATIVE);
if (!err)
memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1));
@@ -961,7 +981,7 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
{
struct mlx4_ib_dev *dev = to_mdev(ibdev);
int err;
- u32 counter_index = dev->counters[port_num - 1] & 0xffff;
+ u32 counter_index = dev->counters[port_num - 1].counter_index & 0xffff;
u8 mode;
char counter_buf[MLX4_IF_STAT_SZ(1)];
union mlx4_counter *counter = (union mlx4_counter *)
@@ -970,10 +990,16 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
return -EINVAL;
- if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) {
- err = IB_MAD_RESULT_FAILURE;
- } else {
+	/* In the case of the default counter, IB shares the counter with ETH; */
+	/* the status could be -EEXIST or -ENOSPC */
+ if (dev->counters[port_num - 1].status) {
memset(out_mad->data, 0, sizeof out_mad->data);
+ err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ } else {
+ if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0))
+ return IB_MAD_RESULT_FAILURE;
+
+ memset(out_mad->data, 0, sizeof(out_mad->data));
mode = counter->control.cnt_mode & 0xFF;
err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
switch (mode & 0xf) {
@@ -992,7 +1018,6 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
}
}
-
return err;
}
@@ -1179,6 +1204,11 @@ void handle_port_mgmt_change_event(struct work_struct *work)
u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
update_sm_ah(dev, port, lid, sl);
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_SM_CHANGE);
+ if (mlx4_is_master(dev->dev))
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+ changed_attr & MSTR_SM_CHANGE_MASK,
+ lid, sl);
}
/* Check if it is a lid change event */
@@ -1295,8 +1325,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
- enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
- u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad)
+ enum ib_qp_type dest_qpt, u16 pkey_index,
+ u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr,
+ u8 *s_mac, struct ib_mad *mad)
{
struct ib_sge list;
struct ib_send_wr wr, *bad_wr;
@@ -1385,6 +1416,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;
+ if (s_mac)
+ memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6);
+
ret = ib_post_send(send_qp, &wr, &bad_wr);
out:
@@ -1512,6 +1546,11 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
if (ah_attr.ah_flags & IB_AH_GRH)
if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr))
return;
+ memcpy(ah_attr.dmac, tunnel->hdr.mac, 6);
+ ah_attr.vlan_id = tunnel->hdr.vlan;
+ /* if slave have default vlan use it */
+ mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
+ &ah_attr.vlan_id, &ah_attr.sl);
mlx4_ib_send_to_wire(dev, slave, ctx->port,
is_proxy_qp0(dev, wc->src_qp, slave) ?
@@ -1519,7 +1558,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
be16_to_cpu(tunnel->hdr.pkey_index),
be32_to_cpu(tunnel->hdr.remote_qpn),
be32_to_cpu(tunnel->hdr.qkey),
- &ah_attr, &tunnel->mad);
+ &ah_attr, wc->smac, &tunnel->mad);
}
static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
@@ -1564,6 +1603,12 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
tun_qp->ring[i].addr,
rx_buf_size,
DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ctx->ib_dev,
+ tun_qp->ring[i].map))) {
+ mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n");
+ kfree(tun_qp->ring[i].addr);
+ goto err;
+ }
}
for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
@@ -1576,6 +1621,12 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
tun_qp->tx_ring[i].buf.addr,
tx_buf_size,
DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.map))) {
+ mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n");
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ goto tx_err;
+ }
tun_qp->tx_ring[i].ah = NULL;
}
spin_lock_init(&tun_qp->tx_lock);
@@ -1664,12 +1715,12 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
(MLX4_NUM_TUNNEL_BUFS - 1));
if (ret)
pr_err("Failed reposting tunnel "
- "buf:%lld\n", (long long)wc.wr_id);
+ "buf:%lld\n", (unsigned long long)wc.wr_id);
break;
case IB_WC_SEND:
pr_debug("received tunnel send completion:"
"wrid=0x%llx, status=0x%x\n",
- (long long)wc.wr_id, wc.status);
+ (unsigned long long)wc.wr_id, wc.status);
ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
(MLX4_NUM_TUNNEL_BUFS - 1)].ah);
tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
@@ -1685,7 +1736,7 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
} else {
pr_debug("mlx4_ib: completion error in tunnel: %d."
" status = %d, wrid = 0x%llx\n",
- ctx->slave, wc.status, (long long)wc.wr_id);
+ ctx->slave, wc.status, (unsigned long long)wc.wr_id);
if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
(MLX4_NUM_TUNNEL_BUFS - 1)].ah);
@@ -1757,6 +1808,11 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
memset(&attr, 0, sizeof attr);
attr.qp_state = IB_QPS_INIT;
+ ret = 0;
+ if (create_tun)
+ ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave,
+ ctx->port, 0xFFFF, &attr.pkey_index);
+ if (ret || !create_tun)
attr.pkey_index =
to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
attr.qkey = IB_QP1_QKEY;
@@ -1837,7 +1893,7 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
(MLX4_NUM_TUNNEL_BUFS - 1)))
pr_err("Failed reposting SQP "
- "buf:%lld\n", (long long)wc.wr_id);
+ "buf:%lld\n", (unsigned long long)wc.wr_id);
break;
default:
BUG_ON(1);
@@ -1846,7 +1902,7 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
} else {
pr_debug("mlx4_ib: completion error in tunnel: %d."
" status = %d, wrid = 0x%llx\n",
- ctx->slave, wc.status, (long long)wc.wr_id);
+ ctx->slave, wc.status, (unsigned long long)wc.wr_id);
if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
ib_destroy_ah(sqp->tx_ring[wc.wr_id &
(MLX4_NUM_TUNNEL_BUFS - 1)].ah);
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/main.c b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
index fd0b723..bdcffbe 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/main.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
@@ -32,37 +32,37 @@
*/
#include <linux/module.h>
-
-#ifdef __linux__
-#include <linux/proc_fs.h>
-#endif
-
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_vlan.h>
-#include <linux/bitops.h>
-#include <linux/if_ether.h>
#include <linux/fs.h>
+#include <net/ipv6.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_verbs_exp.h>
#include <rdma/ib_addr.h>
#include <linux/mlx4/driver.h>
#include <linux/mlx4/cmd.h>
#include <linux/sched.h>
+#include <linux/page.h>
+#include <linux/printk.h>
#include "mlx4_ib.h"
+#include "mlx4_exp.h"
#include "user.h"
#include "wc.h"
#define DRV_NAME MLX4_IB_DRV_NAME
#define DRV_VERSION "1.0"
-#define DRV_RELDATE "April 4, 2008"
+#define DRV_RELDATE __DATE__
#define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib"
#define MLX4_IB_MRS_PROC_DIR_NAME "mrs"
+#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
+#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
@@ -73,20 +73,30 @@ MODULE_VERSION(DRV_VERSION);
int mlx4_ib_sm_guid_assign = 1;
-#ifdef __linux__
-struct proc_dir_entry *mlx4_mrs_dir_entry;
-static struct proc_dir_entry *mlx4_ib_driver_dir_entry;
-#endif
-
module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)");
-static char dev_assign_str[512];
-//module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644);
-MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to "
- "IB device numbers following the pattern: "
- "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)."
- " Max supported devices - 32");
+enum {
+ MAX_NUM_STR_BITMAP = 1 << 15,
+ DEFAULT_TBL_VAL = -1
+};
+
+static struct mlx4_dbdf2val_lst dev_assign_str = {
+ .name = "dev_assign_str param",
+ .num_vals = 1,
+ .def_val = {DEFAULT_TBL_VAL},
+ .range = {0, MAX_NUM_STR_BITMAP - 1}
+};
+module_param_string(dev_assign_str, dev_assign_str.str,
+ sizeof(dev_assign_str.str), 0444);
+MODULE_PARM_DESC(dev_assign_str,
+ "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n"
+ "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 1).\n"
+ "\t\tMax supported devices - 32");
+
+
+static unsigned long *dev_num_str_bitmap;
+static spinlock_t dev_num_str_lock;
static const char mlx4_ib_version[] =
DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
@@ -106,11 +116,16 @@ struct dev_rec {
int nr;
};
-#define MAX_DR 32
-static struct dev_rec dr[MAX_DR];
+static int dr_active;
static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*,
+ unsigned long);
+
+static u8 mlx4_ib_get_dev_port(struct net_device *dev,
+ struct mlx4_ib_dev *ibdev);
+
static struct workqueue_struct *wq;
static void init_query_mad(struct ib_smp *mad)
@@ -123,7 +138,30 @@ static void init_query_mad(struct ib_smp *mad)
static union ib_gid zgid;
-static int mlx4_ib_query_device(struct ib_device *ibdev,
+static int check_flow_steering_support(struct mlx4_dev *dev)
+{
+ int eth_num_ports = 0;
+ int ib_num_ports = 0;
+ int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED;
+
+ if (dmfs) {
+ int i;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+ eth_num_ports++;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
+ dmfs &= (!ib_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) &&
+ (!eth_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN));
+ if (ib_num_ports && mlx4_is_mfunc(dev)) {
+ dmfs = 0;
+ }
+ }
+ return dmfs;
+}
+
+int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props)
{
struct mlx4_ib_dev *dev = to_mdev(ibdev);
@@ -174,12 +212,26 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
props->device_cap_flags |= IB_DEVICE_XRC;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)
+ props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
+
+ if (check_flow_steering_support(dev->dev))
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
+
props->device_cap_flags |= IB_DEVICE_QPG;
if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
props->device_cap_flags |= IB_DEVICE_UD_RSS;
props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz;
}
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
+ else
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+ }
props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
0xffffff;
props->vendor_part_id = dev->dev->pdev->device;
@@ -213,6 +265,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
props->max_mcast_grp;
props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+ props->hca_core_clock = dev->dev->caps.hca_core_clock;
+ if (dev->dev->caps.hca_core_clock > 0)
+ props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK;
+ if (dev->dev->caps.cq_timestamp) {
+ props->timestamp_mask = 0xFFFFFFFFFFFF;
+ props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK;
+ }
out:
kfree(in_mad);
@@ -334,6 +393,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
struct net_device *ndev;
enum ib_mtu tmp;
struct mlx4_cmd_mailbox *mailbox;
+ unsigned long flags;
int err = 0;
mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
@@ -362,7 +422,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
props->state = IB_PORT_DOWN;
props->phys_state = state_to_phys_state(props->state);
props->active_mtu = IB_MTU_256;
- spin_lock(&iboe->lock);
+ spin_lock_irqsave(&iboe->lock, flags);
ndev = iboe->netdevs[port - 1];
if (!ndev)
goto out_unlock;
@@ -374,7 +434,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
IB_PORT_ACTIVE : IB_PORT_DOWN;
props->phys_state = state_to_phys_state(props->state);
out_unlock:
- spin_unlock(&iboe->lock);
+ spin_unlock_irqrestore(&iboe->lock, flags);
out:
mlx4_free_cmd_mailbox(mdev->dev, mailbox);
return err;
@@ -674,7 +734,9 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
return 0;
}
-#ifdef __linux__
+
+/* XXX FBSD has no support for get_unmapped_area function */
+#if 0
static unsigned long mlx4_ib_get_unmapped_area(struct file *file,
unsigned long addr,
unsigned long len, unsigned long pgoff,
@@ -732,7 +794,6 @@ full_search:
static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
{
struct mlx4_ib_dev *dev = to_mdev(context->device);
- int err;
/* Last 8 bits hold the command others are data per that command */
unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK;
@@ -758,31 +819,81 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
dev->dev->caps.num_uars,
PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
- } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) {
- /* Getting contiguous physical pages */
- unsigned long total_size = vma->vm_end - vma->vm_start;
- unsigned long page_size_order = (vma->vm_pgoff) >>
- MLX4_IB_MMAP_CMD_BITS;
- struct ib_cmem *ib_cmem;
- ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size,
- page_size_order);
- if (IS_ERR(ib_cmem)) {
- err = PTR_ERR(ib_cmem);
- return err;
- }
+ } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) {
+ struct mlx4_clock_params params;
+ int ret;
- err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma);
- if (err) {
- ib_cmem_release_contiguous_pages(ib_cmem);
- return err;
- }
- return 0;
+ ret = mlx4_get_internal_clock_params(dev->dev, &params);
+ if (ret)
+ return ret;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ (pci_resource_start(dev->dev->pdev,
+ params.bar) + params.offset)
+ >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
} else
return -EINVAL;
return 0;
}
+static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->device);
+ int ret;
+ int offset;
+
+ switch (cmd) {
+ case MLX4_IOCHWCLOCKOFFSET: {
+ struct mlx4_clock_params params;
+ int ret;
+ ret = mlx4_get_internal_clock_params(dev->dev, &params);
+ if (!ret) {
+ offset = params.offset % PAGE_SIZE;
+ ret = put_user(offset,
+ (int *)arg);
+ return sizeof(int);
+ } else {
+ return ret;
+ }
+ }
+ default: {
+ pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n",
+ cmd, arg);
+ return -ENOTTY;
+ }
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_query_values(struct ib_device *device, int q_values,
+ struct ib_device_values *values)
+{
+ struct mlx4_ib_dev *dev = to_mdev(device);
+ cycle_t cycles;
+
+ values->values_mask = 0;
+ if (q_values & IBV_VALUES_HW_CLOCK) {
+ cycles = mlx4_read_clock(dev->dev);
+ if (cycles < 0) {
+ values->hwclock = cycles & CORE_CLOCK_MASK;
+ values->values_mask |= IBV_VALUES_HW_CLOCK;
+ }
+ q_values &= ~IBV_VALUES_HW_CLOCK;
+ }
+
+ if (q_values)
+ return -ENOTTY;
+
+ return 0;
+}
+
static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
struct ib_ucontext *context,
struct ib_udata *udata)
@@ -926,258 +1037,220 @@ struct mlx4_ib_steering {
union ib_gid gid;
};
-static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+static int parse_flow_attr(struct mlx4_dev *dev,
+ union ib_flow_spec *ib_spec,
+ struct _rule_hw *mlx4_spec)
{
- int err;
- struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
- struct mlx4_ib_qp *mqp = to_mqp(ibqp);
- u64 reg_id;
- struct mlx4_ib_steering *ib_steering = NULL;
+ enum mlx4_net_trans_rule_id type;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ type = MLX4_NET_TRANS_RULE_ID_ETH;
+ memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac,
+ ETH_ALEN);
+ memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac,
+ ETH_ALEN);
+ mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
+ mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+ break;
- if (mdev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED) {
- ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
- if (!ib_steering)
- return -ENOMEM;
- }
+ case IB_FLOW_SPEC_IB:
+ type = MLX4_NET_TRANS_RULE_ID_IB;
+ mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn;
+ mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn;
+ memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16);
+ memcpy(&mlx4_spec->ib.dst_gid_msk,
+ ib_spec->ib.mask.dst_gid, 16);
+ break;
- err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
- !!(mqp->flags &
- MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
- MLX4_PROT_IB_IPV6, &reg_id);
- if (err)
- goto err_malloc;
+ case IB_FLOW_SPEC_IPV4:
+ type = MLX4_NET_TRANS_RULE_ID_IPV4;
+ mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
+ mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip;
+ mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip;
+ mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip;
+ break;
- err = add_gid_entry(ibqp, gid);
- if (err)
- goto err_add;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ type = ib_spec->type == IB_FLOW_SPEC_TCP ?
+ MLX4_NET_TRANS_RULE_ID_TCP :
+ MLX4_NET_TRANS_RULE_ID_UDP;
+ mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
+ mlx4_spec->tcp_udp.dst_port_msk =
+ ib_spec->tcp_udp.mask.dst_port;
+ mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
+ mlx4_spec->tcp_udp.src_port_msk =
+ ib_spec->tcp_udp.mask.src_port;
+ break;
- if (ib_steering) {
- memcpy(ib_steering->gid.raw, gid->raw, 16);
- ib_steering->reg_id = reg_id;
- mutex_lock(&mqp->mutex);
- list_add(&ib_steering->list, &mqp->steering_rules);
- mutex_unlock(&mqp->mutex);
+ default:
+ return -EINVAL;
}
- return 0;
-
-err_add:
- mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
- MLX4_PROT_IB_IPV6, reg_id);
-err_malloc:
- kfree(ib_steering);
-
- return err;
+ if (map_sw_to_hw_steering_id(dev, type) < 0 ||
+ hw_rule_sz(dev, type) < 0)
+ return -EINVAL;
+ mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type));
+ mlx4_spec->size = hw_rule_sz(dev, type) >> 2;
+ return hw_rule_sz(dev, type);
}
-enum {
- IBV_FLOW_L4_NONE = 0,
- IBV_FLOW_L4_OTHER = 3,
- IBV_FLOW_L4_UDP = 5,
- IBV_FLOW_L4_TCP = 6
-};
-
-struct mlx4_cm_steering {
- struct list_head list;
- u64 reg_id;
- struct ib_flow_spec spec;
-};
-
-static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec,
- struct list_head *rule_list_h)
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ int domain,
+ enum mlx4_net_trans_promisc_mode flow_type,
+ u64 *reg_id)
{
- struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4;
- u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
-
- spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL);
- if (!spec_l2)
- return -ENOMEM;
+ int ret, i;
+ int size = 0;
+ void *ib_flow;
+ struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+ size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) +
+ (sizeof(struct _rule_hw) * flow_attr->num_of_specs);
+
+ static const u16 __mlx4_domain[] = {
+ [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
+ [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
+ [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
+ [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
+ };
- switch (flow_spec->type) {
- case IB_FLOW_ETH:
- spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
- memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN);
- memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
- spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype;
- if (flow_spec->l2_id.eth.vlan_present) {
- spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan;
- spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff);
- }
- break;
- case IB_FLOW_IB_UC:
- spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB;
- if(flow_spec->l2_id.ib_uc.qpn) {
- spec_l2->ib.l3_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn);
- spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff);
+ if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+ pr_err("Invalid priority value.\n");
+ return -EINVAL;
}
- break;
- case IB_FLOW_IB_MC_IPV4:
- case IB_FLOW_IB_MC_IPV6:
- spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB;
- memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16);
- memset(spec_l2->ib.dst_gid_msk, 0xff, 16);
- break;
+ if (domain >= IB_FLOW_DOMAIN_NUM) {
+ pr_err("Invalid domain value.\n");
+ return -EINVAL;
}
+ if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+ return -EINVAL;
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, rule_size);
+ ctrl = mailbox->buf;
+
+ ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
+ flow_attr->priority);
+ ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+ ctrl->port = flow_attr->port;
+ ctrl->qpn = cpu_to_be32(qp->qp_num);
+
+ if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK)
+ ctrl->flags = (1 << 3);
+
+ ib_flow = flow_attr + 1;
+ size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+ for (i = 0; i < flow_attr->num_of_specs; i++) {
+ ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size);
+ if (ret < 0) {
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return -EINVAL;
+ }
+ ib_flow += ((union ib_flow_spec *)ib_flow)->size;
+ size += ret;
+ }
- list_add_tail(&spec_l2->list, rule_list_h);
+ ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+ MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (ret == -ENOMEM)
+		pr_err("mcg table is full. Failed to register network rule.\n");
+	else if (ret == -ENXIO)
+		pr_err("Device managed flow steering is disabled. Failed to register network rule.\n");
+	else if (ret)
+		pr_err("Invalid argument. Failed to register network rule.\n");
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return ret;
+}
- if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) ||
- flow_spec->type != IB_FLOW_ETH) {
- spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
- if (!spec_l3)
- return -ENOMEM;
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+{
+ int err;
+ err = mlx4_cmd(dev, reg_id, 0, 0,
+ MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (err)
+		pr_err("Failed to detach network rule. Registration id = 0x%llx\n",
+ (unsigned long long)reg_id);
+ return err;
+}
- spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
- spec_l3->ipv4.src_ip = flow_spec->src_ip;
- if (flow_spec->type != IB_FLOW_IB_MC_IPV4 &&
- flow_spec->type != IB_FLOW_IB_MC_IPV6)
- spec_l3->ipv4.dst_ip = flow_spec->dst_ip;
+static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ int domain)
+{
+ int err = 0, i = 0;
+ struct mlx4_ib_flow *mflow;
+ enum mlx4_net_trans_promisc_mode type[2];
- if (spec_l3->ipv4.src_ip)
- spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK;
- if (spec_l3->ipv4.dst_ip)
- spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK;
+ memset(type, 0, sizeof(type));
- list_add_tail(&spec_l3->list, rule_list_h);
+ mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL);
+ if (!mflow) {
+ err = -ENOMEM;
+ goto err_free;
}
- if (flow_spec->l4_protocol) {
- spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL);
- if (!spec_l4)
- return -ENOMEM;
-
- spec_l4->tcp_udp.src_port = flow_spec->src_port;
- spec_l4->tcp_udp.dst_port = flow_spec->dst_port;
- if (spec_l4->tcp_udp.src_port)
- spec_l4->tcp_udp.src_port_msk =
- MLX4_BE_SHORT_MASK;
- if (spec_l4->tcp_udp.dst_port)
- spec_l4->tcp_udp.dst_port_msk =
- MLX4_BE_SHORT_MASK;
-
- switch (flow_spec->l4_protocol) {
- case IBV_FLOW_L4_UDP:
- spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
+ switch (flow_attr->type) {
+ case IB_FLOW_ATTR_NORMAL:
+ type[0] = MLX4_FS_REGULAR;
break;
- case IBV_FLOW_L4_TCP:
- spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
- break;
- default:
- dev_err(dev->dma_device,
- "Unsupported l4 protocol.\n");
- kfree(spec_l4);
- return -EPROTONOSUPPORT;
- }
- list_add_tail(&spec_l4->list, rule_list_h);
- }
- return 0;
-}
-
-static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev,
- struct mlx4_ib_qp *mqp,
- struct ib_flow_spec *flow_spec,
- int priority, int lock_qp)
-{
- u64 reg_id = 0;
- int err = 0;
- struct mlx4_cm_steering *cm_flow;
- struct mlx4_spec_list *spec, *tmp_spec;
- struct mlx4_net_trans_rule rule =
- { .queue_mode = MLX4_NET_TRANS_Q_FIFO,
- .exclusive = 0,
- };
+ case IB_FLOW_ATTR_ALL_DEFAULT:
+ type[0] = MLX4_FS_ALL_DEFAULT;
+ break;
- rule.promisc_mode = flow_spec->rule_type;
- rule.port = mqp->port;
- rule.qpn = mqp->mqp.qpn;
- INIT_LIST_HEAD(&rule.list);
+ case IB_FLOW_ATTR_MC_DEFAULT:
+ type[0] = MLX4_FS_MC_DEFAULT;
+ break;
- cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL);
- if (!cm_flow)
- return -ENOMEM;
+ case IB_FLOW_ATTR_SNIFFER:
+ type[0] = MLX4_FS_UC_SNIFFER;
+ type[1] = MLX4_FS_MC_SNIFFER;
+ break;
- if (rule.promisc_mode == MLX4_FS_REGULAR) {
- rule.allow_loopback = !flow_spec->block_mc_loopback;
- rule.priority = MLX4_DOMAIN_UVERBS | priority;
- err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec,
- &rule.list);
- if (err)
- goto free_list;
+ default:
+ err = -EINVAL;
+ goto err_free;
}
- err = mlx4_flow_attach(mdev->dev, &rule, &reg_id);
+ while (i < ARRAY_SIZE(type) && type[i]) {
+ err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
+ &mflow->reg_id[i]);
if (err)
- goto free_list;
-
- memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec));
- cm_flow->reg_id = reg_id;
+ goto err_free;
+ i++;
+ }
- if (lock_qp)
- mutex_lock(&mqp->mutex);
- list_add(&cm_flow->list, &mqp->rules_list);
- if (lock_qp)
- mutex_unlock(&mqp->mutex);
+ return &mflow->ibflow;
-free_list:
- list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) {
- list_del(&spec->list);
- kfree(spec);
- }
- if (err) {
- kfree(cm_flow);
- dev_err(mdev->ib_dev.dma_device,
- "Fail to attach flow steering rule\n");
- }
- return err;
+err_free:
+ kfree(mflow);
+ return ERR_PTR(err);
}
-static int __mlx4_ib_flow_detach(struct mlx4_ib_dev *mdev,
- struct mlx4_ib_qp *mqp,
- struct ib_flow_spec *spec, int priority,
- int lock_qp)
+static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
{
- struct mlx4_cm_steering *cm_flow;
- int ret;
+ int err, ret = 0;
+ int i = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
+ struct mlx4_ib_flow *mflow = to_mflow(flow_id);
- if (lock_qp)
- mutex_lock(&mqp->mutex);
- list_for_each_entry(cm_flow, &mqp->rules_list, list) {
- if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) {
- list_del(&cm_flow->list);
- break;
- }
- }
- if (lock_qp)
- mutex_unlock(&mqp->mutex);
-
- if (&cm_flow->list == &mqp->rules_list) {
- dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. "
- "Steering rule is left attached\n");
- return -EINVAL;
+ while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]);
+ if (err)
+ ret = err;
+ i++;
}
- ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id);
-
- kfree(cm_flow);
+ kfree(mflow);
return ret;
}
-static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec,
- int priority)
-{
- return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp),
- flow_spec, priority, 1);
-}
-
-static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec,
- int priority)
-{
- return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp),
- spec, priority, 1);
-}
-
static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
{
struct mlx4_ib_gid_entry *ge;
@@ -1194,40 +1267,14 @@ static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
return ret;
}
-static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+
+static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
{
- int err;
struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
struct mlx4_ib_qp *mqp = to_mqp(ibqp);
- u8 mac[6];
- struct net_device *ndev;
struct mlx4_ib_gid_entry *ge;
- u64 reg_id = 0;
-
- if (mdev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED) {
- struct mlx4_ib_steering *ib_steering;
-
- mutex_lock(&mqp->mutex);
- list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
- if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
- list_del(&ib_steering->list);
- break;
- }
- }
- mutex_unlock(&mqp->mutex);
- if (&ib_steering->list == &mqp->steering_rules) {
- pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
- return -EINVAL;
- }
- reg_id = ib_steering->reg_id;
- kfree(ib_steering);
- }
-
- err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
- MLX4_PROT_IB_IPV6, reg_id);
- if (err)
- return err;
+ struct net_device *ndev;
+ u8 mac[6];
mutex_lock(&mqp->mutex);
ge = find_gid_entry(mqp, gid->raw);
@@ -1250,8 +1297,174 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
pr_warn("could not find mgid entry\n");
mutex_unlock(&mqp->mutex);
+ return ge != 0 ? 0 : -EINVAL;
+}
+
+static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid,
+ int count)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ u64 reg_id = 0;
+ int record_err = 0;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ struct mlx4_ib_steering *ib_steering;
+ struct mlx4_ib_steering *tmp;
+ LIST_HEAD(temp);
+
+ mutex_lock(&mqp->mutex);
+ list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules,
+ list) {
+ if (memcmp(ib_steering->gid.raw, gid->raw, 16))
+ continue;
+
+ if (--count < 0)
+ break;
+
+ list_del(&ib_steering->list);
+ list_add(&ib_steering->list, &temp);
+ }
+ mutex_unlock(&mqp->mutex);
+ list_for_each_entry_safe(ib_steering, tmp, &temp,
+ list) {
+ reg_id = ib_steering->reg_id;
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp,
+ gid->raw,
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
+ MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
+ reg_id);
+ if (err) {
+ record_err = record_err ?: err;
+ continue;
+ }
+
+ err = del_gid_entry(ibqp, gid);
+ if (err) {
+ record_err = record_err ?: err;
+ continue;
+ }
+
+ list_del(&ib_steering->list);
+ kfree(ib_steering);
+ }
+ mutex_lock(&mqp->mutex);
+ list_for_each_entry(ib_steering, &temp, list) {
+ list_add(&ib_steering->list, &mqp->steering_rules);
+ }
+ mutex_unlock(&mqp->mutex);
+ if (count) {
+ pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n");
+ return -EINVAL;
+ }
+
+ } else {
+ if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 &&
+ ibqp->qp_type == IB_QPT_RAW_PACKET)
+ gid->raw[5] = mqp->port;
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
+ MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
+ reg_id);
+ if (err)
+ return err;
+
+ err = del_gid_entry(ibqp, gid);
+
+ if (err)
+ return err;
+ }
+
+ return record_err;
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ int count = (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) ?
+ mdev->dev->caps.num_ports : 1;
+
+ return _mlx4_ib_mcg_detach(ibqp, gid, lid, count);
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err = -ENODEV;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ DECLARE_BITMAP(ports, MLX4_MAX_PORTS);
+ int i = 0;
+
+ if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 &&
+ ibqp->qp_type == IB_QPT_RAW_PACKET)
+ gid->raw[5] = mqp->port;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ bitmap_fill(ports, mdev->dev->caps.num_ports);
+ } else {
+ if (mqp->port <= mdev->dev->caps.num_ports) {
+ bitmap_zero(ports, mdev->dev->caps.num_ports);
+ set_bit(0, ports);
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ for (; i < mdev->dev->caps.num_ports; i++) {
+ u64 reg_id;
+ struct mlx4_ib_steering *ib_steering = NULL;
+ if (!test_bit(i, ports))
+ continue;
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+ if (!ib_steering)
+ goto err_add;
+ }
+
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp,
+ gid->raw, i + 1,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
+ MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
+ &reg_id);
+ if (err) {
+ kfree(ib_steering);
+ goto err_add;
+ }
+
+ err = add_gid_entry(ibqp, gid);
+ if (err) {
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ MLX4_PROT_IB_IPV6, reg_id);
+ kfree(ib_steering);
+ goto err_add;
+ }
+
+ if (ib_steering) {
+ memcpy(ib_steering->gid.raw, gid->raw, 16);
+ mutex_lock(&mqp->mutex);
+ list_add(&ib_steering->list, &mqp->steering_rules);
+ mutex_unlock(&mqp->mutex);
+ ib_steering->reg_id = reg_id;
+ }
+ }
+
return 0;
+
+err_add:
+ if (i > 0)
+ _mlx4_ib_mcg_detach(ibqp, gid, lid, i);
+
+ return err;
}
static int init_node_data(struct mlx4_ib_dev *dev)
@@ -1327,27 +1540,39 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
dev->dev->board_id);
}
+static ssize_t show_vsd(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ ssize_t len = MLX4_VSD_LEN;
+
+ if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX)
+ len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd);
+ else
+ memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN);
+
+ return len;
+}
+
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL);
static struct device_attribute *mlx4_class_attributes[] = {
&dev_attr_hw_rev,
&dev_attr_fw_ver,
&dev_attr_hca_type,
- &dev_attr_board_id
+ &dev_attr_board_id,
+ &dev_attr_vsd
};
-static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev)
+static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port)
{
-#ifdef __linux__
- memcpy(eui, dev->dev_addr, 3);
- memcpy(eui + 5, dev->dev_addr + 3, 3);
-#else
memcpy(eui, IF_LLADDR(dev), 3);
memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
-#endif
if (vlan_id < 0x1000) {
eui[3] = vlan_id >> 8;
eui[4] = vlan_id & 0xff;
@@ -1366,191 +1591,352 @@ static void update_gids_task(struct work_struct *work)
int err;
struct mlx4_dev *dev = gw->dev->dev;
+
mailbox = mlx4_alloc_cmd_mailbox(dev);
if (IS_ERR(mailbox)) {
pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
- return;
+ goto free;
}
gids = mailbox->buf;
memcpy(gids, gw->gids, sizeof gw->gids);
- err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+ if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
+ IB_LINK_LAYER_ETHERNET) {
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
MLX4_CMD_WRAPPED);
+
if (err)
pr_warn("set port command failed\n");
- else {
- memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids);
- mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE);
+ else
+ mlx4_ib_dispatch_event(gw->dev, gw->port,
+ IB_EVENT_GID_CHANGE);
+ }
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+free:
+ kfree(gw);
+}
+
+static void reset_gids_task(struct work_struct *work)
+{
+ struct update_gid_work *gw =
+ container_of(work, struct update_gid_work, work);
+ struct mlx4_cmd_mailbox *mailbox;
+ union ib_gid *gids;
+ int err;
+ struct mlx4_dev *dev = gw->dev->dev;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ pr_warn("reset gid table failed\n");
+ goto free;
+ }
+
+ gids = mailbox->buf;
+ memcpy(gids, gw->gids, sizeof(gw->gids));
+
+ if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) ==
+ IB_LINK_LAYER_ETHERNET &&
+ dev->caps.num_ports > 0) {
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | 1,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ pr_warn("set port 1 command failed\n");
+ }
+
+ if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) ==
+ IB_LINK_LAYER_ETHERNET &&
+ dev->caps.num_ports > 1) {
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | 2,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ pr_warn("set port 2 command failed\n");
}
mlx4_free_cmd_mailbox(dev, mailbox);
+free:
kfree(gw);
}
-static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear)
+static int update_gid_table(struct mlx4_ib_dev *dev, int port,
+ union ib_gid *gid, int clear, int default_gid)
{
- struct net_device *ndev = dev->iboe.netdevs[port - 1];
struct update_gid_work *work;
- struct net_device *tmp;
int i;
- u8 *hits;
- union ib_gid gid;
- int index_free;
- int found;
int need_update = 0;
+ int free = -1;
+ int found = -1;
int max_gids;
- u16 vid;
-
- work = kzalloc(sizeof *work, GFP_ATOMIC);
- if (!work)
- return -ENOMEM;
-
- hits = kzalloc(128, GFP_ATOMIC);
- if (!hits) {
- kfree(work);
- return -ENOMEM;
- }
+ int start_index = !default_gid;
max_gids = dev->dev->caps.gid_table_len[port];
-
-#ifdef __linux__
- rcu_read_lock();
- for_each_netdev_rcu(&init_net, tmp) {
-#else
- IFNET_RLOCK();
- TAILQ_FOREACH(tmp, &V_ifnet, if_link) {
-#endif
- if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) {
- gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
- vid = rdma_vlan_dev_vlan_id(tmp);
- mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev);
- found = 0;
- index_free = -1;
- for (i = 0; i < max_gids; ++i) {
- if (index_free < 0 &&
- !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
- index_free = i;
- if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) {
- hits[i] = 1;
- found = 1;
+ for (i = start_index; i < max_gids; ++i) {
+ if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
+ sizeof(*gid)))
+ found = i;
+
+ if (clear) {
+ if (found >= 0) {
+ need_update = 1;
+ dev->iboe.gid_table[port - 1][found] = zgid;
break;
}
- }
+ } else {
+ if (found >= 0)
+ break;
- if (!found) {
- if (tmp == ndev &&
- (memcmp(&dev->iboe.gid_table[port - 1][0],
- &gid, sizeof gid) ||
- !memcmp(&dev->iboe.gid_table[port - 1][0],
- &zgid, sizeof gid))) {
- dev->iboe.gid_table[port - 1][0] = gid;
- ++need_update;
- hits[0] = 1;
- } else if (index_free >= 0) {
- dev->iboe.gid_table[port - 1][index_free] = gid;
- hits[index_free] = 1;
- ++need_update;
+ if (free < 0 &&
+ !memcmp(&dev->iboe.gid_table[port - 1][i],
+ &zgid, sizeof(*gid)))
+ free = i;
}
}
+
+ if (found == -1 && !clear && free < 0) {
+ pr_err("GID table of port %d is full. Can't add "GID_PRINT_FMT"\n",
+ port, GID_PRINT_ARGS(gid));
+ return -ENOMEM;
}
-#ifdef __linux__
+ if (found == -1 && clear) {
+ pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port);
+ return -EINVAL;
}
- rcu_read_unlock();
-#else
+ if (found == -1 && !clear && free >= 0) {
+ dev->iboe.gid_table[port - 1][free] = *gid;
+ need_update = 1;
}
- IFNET_RUNLOCK();
-#endif
- for (i = 0; i < max_gids; ++i)
- if (!hits[i]) {
- if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
- ++need_update;
- dev->iboe.gid_table[port - 1][i] = zgid;
- }
+ if (!need_update)
+ return 0;
+
+ work = kzalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
- if (need_update) {
- memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids);
+ memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
INIT_WORK(&work->work, update_gids_task);
work->port = port;
work->dev = dev;
queue_work(wq, &work->work);
- } else
- kfree(work);
- kfree(hits);
return 0;
}
-static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event)
+static int reset_gid_table(struct mlx4_ib_dev *dev)
{
- switch (event) {
- case NETDEV_UP:
-#ifdef __linux__
- case NETDEV_CHANGEADDR:
+ struct update_gid_work *work;
+
+
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table));
+ memset(work->gids, 0, sizeof(work->gids));
+ INIT_WORK(&work->work, reset_gids_task);
+ work->dev = dev;
+ queue_work(wq, &work->work);
+ return 0;
+}
+
+/* XXX BOND Related - stub (no support for these flags in FBSD)*/
+static inline int netif_is_bond_master(struct net_device *dev)
+{
+#if 0
+ return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING);
#endif
- update_ipv6_gids(dev, port, 0);
+ return 0;
+}
+
+static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port);
+}
+
+static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev)
+{
+ u8 port = 0;
+ struct mlx4_ib_iboe *iboe;
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
+ rdma_vlan_dev_real_dev(dev) : dev;
+
+ iboe = &ibdev->iboe;
+
+ for (port = 1; port <= MLX4_MAX_PORTS; ++port)
+ if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) ||
+ (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1])))
break;
- case NETDEV_DOWN:
- update_ipv6_gids(dev, port, 1);
- dev->iboe.netdevs[port - 1] = NULL;
+ return port > MLX4_MAX_PORTS ? 0 : port;
+}
+
+static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port)
+{
+ struct ifaddr *ifa;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ struct inet6_dev *in6_dev;
+ union ib_gid *pgid;
+ struct inet6_ifaddr *ifp;
+#endif
+ union ib_gid gid;
+
+
+ if ((port == 0) || (port > MLX4_MAX_PORTS))
+ return;
+
+ /* IPv4 gids */
+ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){
+ ipv6_addr_set_v4mapped(
+ ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr,
+ (struct in6_addr *)&gid);
+ update_gid_table(ibdev, port, &gid, 0, 0);
+ }
+
+ }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ /* IPv6 gids */
+ in6_dev = in6_dev_get(dev);
+ if (in6_dev) {
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ pgid = (union ib_gid *)&ifp->addr;
+ update_gid_table(ibdev, port, pgid, 0, 0);
}
+ read_unlock_bh(&in6_dev->lock);
+ in6_dev_put(in6_dev);
+ }
+#endif
}
-static void netdev_added(struct mlx4_ib_dev *dev, int port)
+static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev, u8 port)
{
- update_ipv6_gids(dev, port, 0);
+ union ib_gid gid;
+ mlx4_make_default_gid(dev, &gid, port);
+ update_gid_table(ibdev, port, &gid, 0, 1);
}
-static void netdev_removed(struct mlx4_ib_dev *dev, int port)
+static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
{
- update_ipv6_gids(dev, port, 1);
+ struct net_device *dev;
+
+ if (reset_gid_table(ibdev))
+ return -1;
+
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(dev, &V_ifnet, if_link) {
+ u8 port = mlx4_ib_get_dev_port(dev, ibdev);
+ if (port) {
+ if (!rdma_vlan_dev_real_dev(dev) &&
+ !netif_is_bond_master(dev))
+ mlx4_set_default_gid(ibdev, dev, port);
+ mlx4_ib_get_dev_addr(dev, ibdev, port);
+ }
+ }
+
+ IFNET_RUNLOCK_NOSLEEP();
+
+ return 0;
}
-static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev, unsigned long event)
{
- struct net_device *dev = ptr;
- struct mlx4_ib_dev *ibdev;
- struct net_device *oldnd;
struct mlx4_ib_iboe *iboe;
int port;
+ int init = 0;
+ unsigned long flags;
-#ifdef __linux__
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-#endif
-
- ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
iboe = &ibdev->iboe;
- spin_lock(&iboe->lock);
+ spin_lock_irqsave(&iboe->lock, flags);
mlx4_foreach_ib_transport_port(port, ibdev->dev) {
- oldnd = iboe->netdevs[port - 1];
+ struct net_device *old_netdev = iboe->netdevs[port - 1];
+/* XXX BOND related */
+#if 0
+ struct net_device *old_master = iboe->masters[port - 1];
+#endif
+ iboe->masters[port - 1] = NULL;
iboe->netdevs[port - 1] =
mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
- if (oldnd != iboe->netdevs[port - 1]) {
- if (iboe->netdevs[port - 1])
- netdev_added(ibdev, port);
- else
- netdev_removed(ibdev, port);
- }
+
+
+ if (old_netdev != iboe->netdevs[port - 1])
+ init = 1;
+ if (dev == iboe->netdevs[port - 1] &&
+ event == NETDEV_CHANGEADDR)
+ init = 1;
+/* XXX BOND related */
+#if 0
+ if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1]))
+ iboe->masters[port - 1] = iboe->netdevs[port - 1]->master;
+
+	/* When bonding is used, the device may be added to masters only after
+	   an IP address has been assigned to the bonding interface. */
+ if (old_master != iboe->masters[port - 1])
+ init = 1;
+#endif
}
- if (dev == iboe->netdevs[0] ||
- (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0]))
- handle_en_event(ibdev, 1, event);
- else if (dev == iboe->netdevs[1]
- || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1]))
- handle_en_event(ibdev, 2, event);
+ spin_unlock_irqrestore(&iboe->lock, flags);
+
+ if (init)
+ if (mlx4_ib_init_gid_table(ibdev))
+			pr_warn("Failed to reset gid table\n");
+}
+
+static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct mlx4_ib_dev *ibdev;
- spin_unlock(&iboe->lock);
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+
+ mlx4_ib_scan_netdevs(ibdev, dev, event);
return NOTIFY_DONE;
}
+/* This function initializes the gid table only if the real device underlying
+ * event_netdev is an iboe device; it is invoked by the inet/inet6 notifiers. */
+static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *event_netdev = ptr;
+ struct mlx4_ib_dev *ibdev;
+ struct mlx4_ib_iboe *ibdev_iboe;
+ int port = 0;
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
+
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
+ rdma_vlan_dev_real_dev(event_netdev) :
+ event_netdev;
+
+ ibdev_iboe = &ibdev->iboe;
+
+ port = mlx4_ib_get_dev_port(real_dev, ibdev);
+
+	/* Reinitialize the gid table only if real_dev is the net_device that
+	 * represents this port; otherwise the event is unrelated and is ignored. */
+ if(port && (real_dev == ibdev_iboe->netdevs[port - 1]))
+ if (mlx4_ib_init_gid_table(ibdev))
+			pr_warn("Failed to reset gid table\n");
+
+ return NOTIFY_DONE;
+}
+
+
static void init_pkeys(struct mlx4_ib_dev *ibdev)
{
int port;
@@ -1615,7 +2001,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
eq = 0;
mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
for (j = 0; j < eq_per_port; j++) {
- snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j,
+ sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j,
pci_get_domain(dev->pdev->dev.bsddev),
pci_get_bus(dev->pdev->dev.bsddev),
PCI_SLOT(dev->pdev->devfn),
@@ -1779,89 +2165,61 @@ static struct attribute_group diag_counters_group = {
.attrs = diag_rprt_attrs
};
-#ifdef __linux__
-static int mlx4_ib_proc_init(void)
+static void init_dev_assign(void)
{
- /* Creating procfs directories /proc/drivers/mlx4_ib/ &&
- /proc/drivers/mlx4_ib/mrs for further use by the driver.
- */
- int err;
+ int i = 1;
- mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME,
- NULL);
- if (!mlx4_ib_driver_dir_entry) {
- pr_err("mlx4_ib_proc_init has failed for %s\n",
- MLX4_IB_DRIVER_PROC_DIR_NAME);
- err = -ENODEV;
- goto error;
+ spin_lock_init(&dev_num_str_lock);
+ if (mlx4_fill_dbdf2val_tbl(&dev_assign_str))
+ return;
+ dev_num_str_bitmap =
+ kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long),
+ GFP_KERNEL);
+ if (!dev_num_str_bitmap) {
+ pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n");
+ return;
}
-
- mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME,
- mlx4_ib_driver_dir_entry);
- if (!mlx4_mrs_dir_entry) {
- pr_err("mlx4_ib_proc_init has failed for %s\n",
- MLX4_IB_MRS_PROC_DIR_NAME);
- err = -ENODEV;
- goto remove_entry;
+ bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP);
+ while ((i < MLX4_DEVS_TBL_SIZE) && (dev_assign_str.tbl[i].dbdf !=
+ MLX4_ENDOF_TBL)) {
+ if (bitmap_allocate_region(dev_num_str_bitmap,
+ dev_assign_str.tbl[i].val[0], 0))
+ goto err;
+ i++;
}
+ dr_active = 1;
+ return;
- return 0;
-
-remove_entry:
- remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME,
- NULL);
-error:
- return err;
+err:
+ kfree(dev_num_str_bitmap);
+ dev_num_str_bitmap = NULL;
+	pr_warn("mlx4_ib: The value of the 'dev_assign_str' parameter "
+		"is invalid and will be ignored.\n");
}
-#endif
-static void init_dev_assign(void)
+static int mlx4_ib_dev_idx(struct mlx4_dev *dev)
{
- int bus, slot, fn, ib_idx;
- char *p = dev_assign_str, *t;
- char curr_val[32] = {0};
- int ret;
- int j, i = 0;
-
- memset(dr, 0, sizeof dr);
-
- if (dev_assign_str[0] == 0)
- return;
-
- while (strlen(p)) {
- ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx);
- if (ret != 4 || ib_idx < 0)
- goto err;
-
- for (j = 0; j < i; j++)
- if (dr[j].nr == ib_idx)
- goto err;
-
- dr[i].bus = bus;
- dr[i].dev = slot;
- dr[i].func = fn;
- dr[i].nr = ib_idx;
-
- t = strchr(p, ',');
- sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx);
- if ((!t) && strlen(p) == strlen(curr_val))
- return;
-
- if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str)
- goto err;
-
- ++i;
- if (i >= MAX_DR)
- goto err;
-
- p = t + 1;
+ int i, val;
+
+ if (!dr_active)
+ return -1;
+ if (!dev)
+ return -1;
+ if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val))
+ return -1;
+
+ if (val != DEFAULT_TBL_VAL) {
+ dev->flags |= MLX4_FLAG_DEV_NUM_STR;
+ return val;
}
- return;
-err:
- memset(dr, 0, sizeof dr);
- printk(KERN_WARNING "mlx4_ib: The value of 'dev_assign_str' parameter "
- "is incorrect. The parameter value is discarded!");
+ spin_lock(&dev_num_str_lock);
+ i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0);
+ spin_unlock(&dev_num_str_lock);
+ if (i >= 0)
+ return i;
+
+ return -1;
}
static void *mlx4_ib_add(struct mlx4_dev *dev)
@@ -1871,8 +2229,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
int i, j;
int err;
struct mlx4_ib_iboe *iboe;
+ int dev_idx;
- printk(KERN_INFO "%s", mlx4_ib_version);
+ pr_info_once("%s", mlx4_ib_version);
mlx4_foreach_ib_transport_port(i, dev)
num_ports++;
@@ -1905,7 +2264,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->dev = dev;
+ dev_idx = mlx4_ib_dev_idx(dev);
+ if (dev_idx >= 0)
+ sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx);
+ else
strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+
ibdev->ib_dev.owner = THIS_MODULE;
ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
@@ -1942,10 +2306,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
(1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
- (1ull << IB_USER_VERBS_CMD_OPEN_QP) |
- (1ull << IB_USER_VERBS_CMD_ATTACH_FLOW) |
- (1ull << IB_USER_VERBS_CMD_DETACH_FLOW) |
- (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+ (1ull << IB_USER_VERBS_CMD_OPEN_QP);
ibdev->ib_dev.query_device = mlx4_ib_query_device;
ibdev->ib_dev.query_port = mlx4_ib_query_port;
@@ -1957,7 +2318,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext;
ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext;
ibdev->ib_dev.mmap = mlx4_ib_mmap;
-#ifdef __linux__
+/* XXX FBSD has no support for get_unmapped_area function */
+#if 0
ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area;
#endif
ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd;
@@ -1990,9 +2352,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list;
ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
- ibdev->ib_dev.attach_flow = mlx4_ib_flow_attach;
- ibdev->ib_dev.detach_flow = mlx4_ib_flow_detach;
ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
+ ibdev->ib_dev.ioctl = mlx4_ib_ioctl;
+ ibdev->ib_dev.query_values = mlx4_ib_query_values;
if (!mlx4_is_slave(ibdev->dev)) {
ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
@@ -2001,6 +2363,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
}
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) {
+ ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
+ ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
+ ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
+
+ ibdev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+ }
+
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
@@ -2009,6 +2381,29 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
}
+ /*
+ * Set experimental data
+ */
+ ibdev->ib_dev.uverbs_exp_cmd_mask =
+ (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) |
+ (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ);
+ ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp;
+ ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device;
+ if (check_flow_steering_support(dev)) {
+ ibdev->ib_dev.uverbs_ex_cmd_mask |=
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+ ibdev->ib_dev.create_flow = mlx4_ib_create_flow;
+ ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow;
+ } else {
+ pr_debug("Device managed flow steering is unavailable for this configuration.\n");
+ }
+ /*
+ * End of experimental data
+ */
+
mlx4_ib_alloc_eqs(dev, ibdev);
spin_lock_init(&iboe->lock);
@@ -2019,18 +2414,29 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
for (i = 0; i < ibdev->num_ports; ++i) {
if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
IB_LINK_LAYER_ETHERNET) {
- err = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i]);
- if (err)
- ibdev->counters[i] = -1;
- } else
- ibdev->counters[i] = -1;
+ if (mlx4_is_slave(dev)) {
+ ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev,
+ i + 1,
+ &ibdev->counters[i].counter_index);
+			} else { /* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */
+ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1;
+ ibdev->counters[i].status = 0;
+ }
+
+ dev_info(&dev->pdev->dev,
+ "%s: allocated counter index %d for port %d\n",
+ __func__, ibdev->counters[i].counter_index, i+1);
+ } else {
+ ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX;
+ ibdev->counters[i].status = -ENOSPC;
+ }
}
spin_lock_init(&ibdev->sm_lock);
mutex_init(&ibdev->cap_mask_mutex);
if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
- !mlx4_is_slave(dev)) {
+ !mlx4_is_mfunc(dev)) {
ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0);
@@ -2063,20 +2469,32 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (mlx4_ib_init_sriov(ibdev))
goto err_mad;
- if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) {
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+ if (!iboe->nb.notifier_call) {
iboe->nb.notifier_call = mlx4_ib_netdev_event;
err = register_netdevice_notifier(&iboe->nb);
- if (err)
- goto err_sriov;
+ if (err) {
+ iboe->nb.notifier_call = NULL;
+ goto err_notify;
+ }
+ }
+ if (!iboe->nb_inet.notifier_call) {
+ iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
+ err = register_inetaddr_notifier(&iboe->nb_inet);
+ if (err) {
+ iboe->nb_inet.notifier_call = NULL;
+ goto err_notify;
+ }
+ }
+ mlx4_ib_scan_netdevs(ibdev, NULL, 0);
}
-
for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
if (device_create_file(&ibdev->ib_dev.dev,
mlx4_class_attributes[j]))
- goto err_notif;
+ goto err_notify;
}
if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
- goto err_notif;
+ goto err_notify;
ibdev->ib_active = true;
@@ -2094,12 +2512,24 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
}
return ibdev;
-err_notif:
+err_notify:
+ for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
+ device_remove_file(&ibdev->ib_dev.dev,
+ mlx4_class_attributes[j]);
+ }
+
+ if (ibdev->iboe.nb.notifier_call) {
if (unregister_netdevice_notifier(&ibdev->iboe.nb))
pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+ if (ibdev->iboe.nb_inet.notifier_call) {
+ if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet.notifier_call = NULL;
+ }
flush_workqueue(wq);
-err_sriov:
mlx4_ib_close_sriov(ibdev);
err_mad:
@@ -2116,9 +2546,14 @@ err_steer_qp_release:
mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
ibdev->steer_qpn_count);
err_counter:
- for (; i; --i)
- if (ibdev->counters[i - 1] != -1)
- mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1]);
+ for (; i; --i) {
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) ==
+ IB_LINK_LAYER_ETHERNET) {
+ mlx4_counter_free(ibdev->dev,
+ i,
+ ibdev->counters[i - 1].counter_index);
+ }
+ }
err_map:
iounmap(ibdev->priv_uar.map);
@@ -2167,30 +2602,71 @@ void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
int is_attach)
{
- struct ib_flow_spec spec = {
- .type = IB_FLOW_IB_UC,
- .l2_id.ib_uc.qpn = mqp->ibqp.qp_num,
- };
-
- return is_attach ?
- __mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0)
- : __mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0);
+ int err;
+ size_t flow_size;
+ struct ib_flow_attr *flow = NULL;
+ struct ib_flow_spec_ib *ib_spec;
+
+ if (is_attach) {
+ flow_size = sizeof(struct ib_flow_attr) +
+ sizeof(struct ib_flow_spec_ib);
+ flow = kzalloc(flow_size, GFP_KERNEL);
+ if (!flow)
+ return -ENOMEM;
+ flow->port = mqp->port;
+ flow->num_of_specs = 1;
+ flow->size = flow_size;
+ ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
+ ib_spec->type = IB_FLOW_SPEC_IB;
+ ib_spec->size = sizeof(struct ib_flow_spec_ib);
+ ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num;
+ ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK;
+
+ err = __mlx4_ib_create_flow(&mqp->ibqp, flow,
+ IB_FLOW_DOMAIN_NIC,
+ MLX4_FS_REGULAR,
+ &mqp->reg_id);
+ } else {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
+ }
+ kfree(flow);
+ return err;
}
static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
{
struct mlx4_ib_dev *ibdev = ibdev_ptr;
- int p,j;
+ int p, j;
+ int dev_idx, ret;
+
+ if (ibdev->iboe.nb_inet.notifier_call) {
+ if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet.notifier_call = NULL;
+ }
mlx4_ib_close_sriov(ibdev);
sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group);
mlx4_ib_mad_cleanup(ibdev);
for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
- device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]);
+ device_remove_file(&ibdev->ib_dev.dev,
+ mlx4_class_attributes[j]);
}
+
+ dev_idx = -1;
+ if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) {
+ ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx);
+ if (ret != 1)
+ dev_idx = -1;
+ }
ib_unregister_device(&ibdev->ib_dev);
+ if (dev_idx >= 0) {
+ spin_lock(&dev_num_str_lock);
+ bitmap_release_region(dev_num_str_bitmap, dev_idx, 0);
+ spin_unlock(&dev_num_str_lock);
+ }
if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
@@ -2204,9 +2680,16 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
ibdev->iboe.nb.notifier_call = NULL;
}
iounmap(ibdev->priv_uar.map);
- for (p = 0; p < ibdev->num_ports; ++p)
- if (ibdev->counters[p] != -1)
- mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p]);
+
+ for (p = 0; p < ibdev->num_ports; ++p) {
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) ==
+ IB_LINK_LAYER_ETHERNET) {
+ mlx4_counter_free(ibdev->dev,
+ p + 1,
+ ibdev->counters[p].counter_index);
+ }
+ }
+
mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
mlx4_CLOSE_PORT(dev, p);
@@ -2355,12 +2838,6 @@ static int __init mlx4_ib_init(void)
if (!wq)
return -ENOMEM;
-#ifdef __linux__
- err = mlx4_ib_proc_init();
- if (err)
- goto clean_wq;
-#endif
-
err = mlx4_ib_mcg_init();
if (err)
goto clean_proc;
@@ -2377,13 +2854,6 @@ clean_mcg:
mlx4_ib_mcg_destroy();
clean_proc:
-#ifdef __linux__
- remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME,
- mlx4_ib_driver_dir_entry);
- remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL);
-
-clean_wq:
-#endif
destroy_workqueue(wq);
return err;
}
@@ -2394,13 +2864,7 @@ static void __exit mlx4_ib_cleanup(void)
mlx4_ib_mcg_destroy();
destroy_workqueue(wq);
- /* Remove proc entries */
-#ifdef __linux__
- remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME,
- mlx4_ib_driver_dir_entry);
- remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL);
-#endif
-
+ kfree(dev_num_str_bitmap);
}
module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE);
@@ -2417,7 +2881,7 @@ static moduledata_t mlx4ib_mod = {
.evhand = mlx4ib_evhand,
};
-DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY);
+DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY);
MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1);
MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1);
MODULE_DEPEND(mlx4ib, linuxapi, 1, 1, 1);
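
For orientation, the main.c hunks above replace the old ib_flow_spec attach/detach interface with the ib_flow_attr based create_flow/destroy_flow verbs. The sketch below condenses how mlx4_ib_steer_qp_reg (in the last main.c hunk) builds an ib_flow_attr with a single ib_flow_spec_ib appended before handing it to __mlx4_ib_create_flow. It is illustrative only, based solely on the structures visible in this patch; the helper name build_qp_steering_flow is hypothetical and relies on the driver's existing headers.

/* Minimal sketch, not part of the patch: allocate an ib_flow_attr with one
 * ib_flow_spec_ib appended, matching a QP by number, as mlx4_ib_steer_qp_reg
 * does above. */
static struct ib_flow_attr *build_qp_steering_flow(struct mlx4_ib_qp *mqp)
{
	size_t flow_size = sizeof(struct ib_flow_attr) +
			   sizeof(struct ib_flow_spec_ib);
	struct ib_flow_attr *flow;
	struct ib_flow_spec_ib *ib_spec;

	flow = kzalloc(flow_size, GFP_KERNEL);
	if (!flow)
		return NULL;

	flow->type = IB_FLOW_ATTR_NORMAL;	/* regular (non-sniffer) rule */
	flow->port = mqp->port;
	flow->num_of_specs = 1;
	flow->size = flow_size;

	/* The single spec immediately follows the attr header. */
	ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
	ib_spec->type = IB_FLOW_SPEC_IB;
	ib_spec->size = sizeof(struct ib_flow_spec_ib);
	ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num;
	ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK;

	return flow;
}
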
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
index e70dfe9..207db3c 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
@@ -36,6 +36,7 @@
#include <rdma/ib_sa.h>
#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
#include <linux/delay.h>
#include "mlx4_ib.h"
@@ -53,6 +54,7 @@
#define mcg_error_group(group, format, arg...) \
pr_err(" %16s: " format, (group)->name, ## arg)
+
static union ib_gid mgid0;
static struct workqueue_struct *clean_wq;
@@ -214,7 +216,7 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
spin_unlock(&dev->sm_lock);
return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port,
- IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad);
+ IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, 0, mad);
}
static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
@@ -567,7 +569,7 @@ static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
group->state = MCAST_IDLE;
atomic_inc(&group->refcount);
- queue_work(group->demux->mcg_wq, &group->work);
+ if (!queue_work(group->demux->mcg_wq, &group->work))
safe_atomic_dec(&group->refcount);
mutex_unlock(&group->lock);
@@ -656,8 +658,9 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
method = group->response_sa_mad.mad_hdr.method;
if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
- (long long unsigned int)be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
- (long long unsigned int)be64_to_cpu(group->last_req_tid));
+ (long long)be64_to_cpu(
+ group->response_sa_mad.mad_hdr.tid),
+ (long long)be64_to_cpu(group->last_req_tid));
group->state = group->prev_state;
goto process_requests;
}
@@ -665,7 +668,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
if (!list_empty(&group->pending_list))
req = list_first_entry(&group->pending_list,
struct mcast_req, group_list);
- if (method == IB_MGMT_METHOD_GET_RESP) {
+ if ((method == IB_MGMT_METHOD_GET_RESP)) {
if (req) {
send_reply_to_slave(req->func, group, &req->sa_mad, status);
--group->func[req->func].num_pend_reqs;
@@ -752,8 +755,8 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx
if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
group->rec.mgid = *new_mgid;
sprintf(group->name, "%016llx%016llx",
- (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix),
- (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id));
+ (long long)be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ (long long)be64_to_cpu(group->rec.mgid.global.interface_id));
list_del_init(&group->mgid0_list);
cur_group = mcast_insert(ctx, group);
if (cur_group) {
@@ -834,8 +837,10 @@ static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
mutex_init(&group->lock);
sprintf(group->name, "%016llx%016llx",
- (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix),
- (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id));
+ (long long)be64_to_cpu(
+ group->rec.mgid.global.subnet_prefix),
+ (long long)be64_to_cpu(
+ group->rec.mgid.global.interface_id));
sysfs_attr_init(&group->dentry.attr);
group->dentry.show = sysfs_show_group;
group->dentry.store = NULL;
@@ -871,7 +876,7 @@ static void queue_req(struct mcast_req *req)
list_add_tail(&req->group_list, &group->pending_list);
list_add_tail(&req->func_list, &group->func[req->func].pending);
/* calls mlx4_ib_mcg_work_handler */
- queue_work(group->demux->mcg_wq, &group->work);
+ if (!queue_work(group->demux->mcg_wq, &group->work))
safe_atomic_dec(&group->refcount);
}
@@ -907,7 +912,7 @@ int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
group->state = MCAST_RESP_READY;
/* calls mlx4_ib_mcg_work_handler */
atomic_inc(&group->refcount);
- queue_work(ctx->mcg_wq, &group->work);
+ if (!queue_work(ctx->mcg_wq, &group->work))
safe_atomic_dec(&group->refcount);
mutex_unlock(&group->lock);
release_group(group, 0);
@@ -998,13 +1003,14 @@ static ssize_t sysfs_show_group(struct device *dev,
else
sprintf(state_str, "%s(TID=0x%llx)",
get_state_string(group->state),
- (long long unsigned int)be64_to_cpu(group->last_req_tid));
+ (long long)be64_to_cpu(group->last_req_tid));
if (list_empty(&group->pending_list)) {
sprintf(pending_str, "No");
} else {
req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
sprintf(pending_str, "Yes(TID=0x%llx)",
- (long long unsigned int)be64_to_cpu(req->sa_mad.mad_hdr.tid));
+ (long long)be64_to_cpu(
+ req->sa_mad.mad_hdr.tid));
}
len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ",
group->rec.scope_join_state & 0xf,
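Side note on the mcg.c hunks above: several call sites now take a group reference before calling queue_work() and give it back when queue_work() reports the item was already pending (return value 0), so exactly one reference accompanies each queued execution. The following user-space model is only an illustrative sketch of that bookkeeping, not kernel code; fake_queue_work(), kick_group() and the plain counters are hypothetical stand-ins for queue_work(), the call sites and the atomic refcount.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel objects used in mcg.c. */
struct fake_group {
	int refcount;          /* models atomic_t group->refcount */
	bool work_pending;     /* models the queued &group->work  */
};

/* Models queue_work(): returns false if the item was already queued. */
static bool fake_queue_work(struct fake_group *g)
{
	if (g->work_pending)
		return false;
	g->work_pending = true;
	return true;
}

/* The pattern at the updated call sites: grab a reference for the
 * handler, and drop it again if no new work was actually queued. */
static void kick_group(struct fake_group *g)
{
	g->refcount++;                 /* atomic_inc(&group->refcount) */
	if (!fake_queue_work(g))
		g->refcount--;         /* safe_atomic_dec(&group->refcount) */
}

int main(void)
{
	struct fake_group g = { .refcount = 1, .work_pending = false };

	kick_group(&g);  /* queues work, reference held for the handler */
	kick_group(&g);  /* already queued: the extra reference is dropped */
	printf("refcount=%d pending=%d\n", g.refcount, (int)g.work_pending);
	return 0;
}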
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c
new file mode 100644
index 0000000..b6a6962
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+#include "mlx4_exp.h"
+#include <linux/mlx4/qp.h>
+
+int mlx4_ib_exp_query_device(struct ib_device *ibdev,
+ struct ib_exp_device_attr *props)
+{
+ struct ib_device_attr *base = &props->base;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int ret = mlx4_ib_query_device(ibdev, &props->base);
+
+ props->exp_comp_mask = IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
+ props->inline_recv_sz = dev->dev->caps.max_rq_sg * sizeof(struct mlx4_wqe_data_seg);
+ props->device_cap_flags2 = 0;
+
+ /* move RSS device cap from device_cap to device_cap_flags2 */
+ if (base->device_cap_flags & IB_DEVICE_QPG) {
+ props->device_cap_flags2 |= IB_EXP_DEVICE_QPG;
+ if (base->device_cap_flags & IB_DEVICE_UD_RSS)
+ props->device_cap_flags2 |= IB_EXP_DEVICE_UD_RSS;
+ }
+ base->device_cap_flags &= ~(IB_DEVICE_QPG |
+ IB_DEVICE_UD_RSS |
+ IB_DEVICE_UD_TSS);
+
+ if (base->max_rss_tbl_sz > 0) {
+ props->max_rss_tbl_sz = base->max_rss_tbl_sz;
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
+ } else {
+ props->max_rss_tbl_sz = 0;
+ props->exp_comp_mask &= ~IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
+ }
+
+ if (props->device_cap_flags2)
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2;
+
+ return ret;
+}
+
+/*
+ * Experimental functions
+ */
+struct ib_qp *mlx4_ib_exp_create_qp(struct ib_pd *pd,
+ struct ib_exp_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ int rwqe_size;
+ struct ib_qp *qp;
+ struct mlx4_ib_qp *mqp;
+ int use_inlr;
+ struct mlx4_ib_dev *dev;
+
+ if (init_attr->max_inl_recv && !udata)
+ return ERR_PTR(-EINVAL);
+
+ use_inlr = mlx4_ib_qp_has_rq((struct ib_qp_init_attr *)init_attr) &&
+ init_attr->max_inl_recv && pd;
+ if (use_inlr) {
+ rwqe_size = roundup_pow_of_two(max(1U, init_attr->cap.max_recv_sge)) *
+ sizeof(struct mlx4_wqe_data_seg);
+ if (rwqe_size < init_attr->max_inl_recv) {
+ dev = to_mdev(pd->device);
+ init_attr->max_inl_recv = min(init_attr->max_inl_recv,
+ (u32)(dev->dev->caps.max_rq_sg *
+ sizeof(struct mlx4_wqe_data_seg)));
+ init_attr->cap.max_recv_sge = roundup_pow_of_two(init_attr->max_inl_recv) /
+ sizeof(struct mlx4_wqe_data_seg);
+ }
+ } else {
+ init_attr->max_inl_recv = 0;
+ }
+ qp = mlx4_ib_create_qp(pd, (struct ib_qp_init_attr *)init_attr, udata);
+ if (IS_ERR(qp))
+ return qp;
+
+ if (use_inlr) {
+ mqp = to_mqp(qp);
+ mqp->max_inlr_data = 1 << mqp->rq.wqe_shift;
+ init_attr->max_inl_recv = mqp->max_inlr_data;
+ }
+
+ return qp;
+}
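The inline-receive path in mlx4_ib_exp_create_qp() above sizes the receive WQE from the SGE count and, when the requested inline size does not fit, clamps it to the device limit and recomputes the SGE count. A rough stand-alone sketch of that arithmetic follows; the constants (16-byte data segments, a sample max_rq_sg of 32) and the input values are illustrative assumptions, not values taken from the driver.

#include <stdint.h>
#include <stdio.h>

#define DATA_SEG_SZ 16u   /* assumed sizeof(struct mlx4_wqe_data_seg) */

static uint32_t roundup_pow_of_two32(uint32_t v)
{
	uint32_t p = 1;
	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	uint32_t max_recv_sge = 3;    /* requested scatter entries (driver clamps to >= 1) */
	uint32_t max_inl_recv = 512;  /* requested inline receive size */
	uint32_t max_rq_sg    = 32;   /* assumed device capability     */

	uint32_t rwqe_size = roundup_pow_of_two32(max_recv_sge) * DATA_SEG_SZ;
	if (rwqe_size < max_inl_recv) {
		/* Cap the inline size by what the device can scatter,
		 * then grow the SGE count to cover it. */
		uint32_t cap = max_rq_sg * DATA_SEG_SZ;
		if (max_inl_recv > cap)
			max_inl_recv = cap;
		max_recv_sge = roundup_pow_of_two32(max_inl_recv) / DATA_SEG_SZ;
	}
	printf("rwqe_size=%u max_inl_recv=%u max_recv_sge=%u\n",
	       rwqe_size, max_inl_recv, max_recv_sge);
	return 0;
}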
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h
new file mode 100644
index 0000000..58675a4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_EXP_H
+#define MLX4_EXP_H
+
+#include <rdma/ib_verbs_exp.h>
+#include "mlx4_ib.h"
+
+struct ib_qp *mlx4_ib_exp_create_qp(struct ib_pd *pd,
+ struct ib_exp_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_exp_query_device(struct ib_device *ibdev,
+ struct ib_exp_device_attr *props);
+
+#endif /* MLX4_EXP_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 2435df5..ddf5236 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -38,6 +38,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
+#include <linux/rbtree.h>
#include <linux/notifier.h>
#include <rdma/ib_verbs.h>
@@ -47,7 +48,6 @@
#include <linux/mlx4/device.h>
#include <linux/mlx4/doorbell.h>
-#include <linux/rbtree.h>
#define MLX4_IB_DRV_NAME "mlx4_ib"
@@ -72,9 +72,7 @@ enum {
/*module param to indicate if SM assigns the alias_GUID*/
extern int mlx4_ib_sm_guid_assign;
-#ifdef __linux__
extern struct proc_dir_entry *mlx4_mrs_dir_entry;
-#endif
#define MLX4_IB_UC_STEER_QPN_ALIGN 1
#define MLX4_IB_UC_MAX_NUM_QPS (256 * 1024)
@@ -128,6 +126,7 @@ struct mlx4_ib_cq {
struct mutex resize_mutex;
struct ib_umem *umem;
struct ib_umem *resize_umem;
+ int create_flags;
};
struct mlx4_ib_mr {
@@ -135,6 +134,13 @@ struct mlx4_ib_mr {
struct mlx4_mr mmr;
struct ib_umem *umem;
struct mlx4_shared_mr_info *smr_info;
+ atomic_t invalidated;
+ struct completion invalidation_comp;
+};
+
+struct mlx4_ib_mw {
+ struct ib_mw ibmw;
+ struct mlx4_mw mmw;
};
struct mlx4_ib_fast_reg_page_list {
@@ -148,6 +154,12 @@ struct mlx4_ib_fmr {
struct mlx4_fmr mfmr;
};
+struct mlx4_ib_flow {
+ struct ib_flow ibflow;
+ /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
+ u64 reg_id[2];
+};
+
struct mlx4_ib_wq {
u64 *wrid;
spinlock_t lock;
@@ -163,6 +175,9 @@ struct mlx4_ib_wq {
enum mlx4_ib_qp_flags {
MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+ MLX4_IB_QP_CAP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL,
+ MLX4_IB_QP_CAP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND,
+ MLX4_IB_QP_CAP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV,
MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
MLX4_IB_SRIOV_SQP = 1 << 31,
@@ -179,6 +194,7 @@ enum mlx4_ib_mmap_cmd {
MLX4_IB_MMAP_UAR_PAGE = 0,
MLX4_IB_MMAP_BLUE_FLAME_PAGE = 1,
MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES = 2,
+ MLX4_IB_MMAP_GET_HW_CLOCK = 3,
};
enum mlx4_ib_qp_type {
@@ -319,8 +335,14 @@ struct mlx4_ib_qp {
struct mlx4_roce_smac_vlan_info pri;
struct mlx4_roce_smac_vlan_info alt;
struct list_head rules_list;
+ u64 reg_id;
int max_inline_data;
struct mlx4_bf bf;
+
+ /*
+ * Experimental data
+ */
+ int max_inlr_data;
};
struct mlx4_ib_srq {
@@ -354,6 +376,12 @@ struct mlx4_ib_ah {
#define MLX4_NOT_SET_GUID (0x00LL)
#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL))
+/****************************************/
+/* ioctl codes */
+/****************************************/
+#define MLX4_IOC_MAGIC 'm'
+#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int)
+
enum mlx4_guid_alias_rec_status {
MLX4_GUID_INFO_STATUS_IDLE,
MLX4_GUID_INFO_STATUS_SET,
@@ -478,7 +506,9 @@ struct mlx4_ib_sriov {
struct mlx4_ib_iboe {
spinlock_t lock;
struct net_device *netdevs[MLX4_MAX_PORTS];
+ struct net_device *masters[MLX4_MAX_PORTS];
struct notifier_block nb;
+ struct notifier_block nb_inet;
union ib_gid gid_table[MLX4_MAX_PORTS][128];
};
@@ -518,6 +548,11 @@ struct mlx4_ib_iov_port {
struct mlx4_ib_iov_sysfs_attr mcg_dentry;
};
+struct mlx4_ib_counter {
+ int counter_index;
+ int status;
+};
+
struct mlx4_ib_dev {
struct ib_device ib_dev;
struct mlx4_dev *dev;
@@ -534,7 +569,7 @@ struct mlx4_ib_dev {
struct mutex cap_mask_mutex;
bool ib_active;
struct mlx4_ib_iboe iboe;
- int counters[MLX4_MAX_PORTS];
+ struct mlx4_ib_counter counters[MLX4_MAX_PORTS];
int *eq_table;
int eq_added;
struct kobject *iov_parent;
@@ -595,6 +630,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
return container_of(ibmr, struct mlx4_ib_mr, ibmr);
}
+static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct mlx4_ib_mw, ibmw);
+}
+
static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
{
return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
@@ -604,6 +644,12 @@ static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
{
return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
}
+
+static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
+{
+ return container_of(ibflow, struct mlx4_ib_flow, ibflow);
+}
+
static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
{
return container_of(ibqp, struct mlx4_ib_qp, ibqp);
@@ -646,16 +692,23 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata, int mr_id);
int mlx4_ib_dereg_mr(struct ib_mr *mr);
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+int mlx4_ib_dealloc_mw(struct ib_mw *mw);
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
int max_page_list_len);
struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
int page_list_len);
void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
-int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_modify_cq(struct ib_cq *cq,
+ struct ib_cq_attr *cq_attr,
+ int cq_attr_mask);
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq);
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
+ struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
struct ib_udata *udata);
int mlx4_ib_destroy_cq(struct ib_cq *cq);
@@ -730,6 +783,13 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
return !!(ah->av.ib.g_slid & 0x80);
}
+static inline int mlx4_ib_qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+ return 0;
+
+ return !attr->srq;
+}
int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
@@ -757,7 +817,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
struct ib_grh *grh, struct ib_mad *mad);
int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
- u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad);
+ u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, struct ib_mad *mad);
__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
@@ -799,5 +859,7 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
int is_attach);
+int mlx4_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props);
#endif /* MLX4_IB_H */
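mlx4_ib.h above replaces the bare per-port counter index with a struct mlx4_ib_counter carrying both the index and an allocation status; the QP-modify hunk later in this patch copies counter_index into the QP context only when it is valid (!= -1) and writes the 0xff sentinel otherwise. A minimal stand-alone model of that selection logic, with invented sample values:

#include <stdio.h>

#define MLX4_MAX_PORTS 2

/* Mirrors the per-port bookkeeping added in mlx4_ib.h. */
struct mlx4_ib_counter {
	int counter_index;    /* -1 when no HW counter was allocated */
	int status;           /* allocation status, driver-defined   */
};

/* Value the QP context's counter_index field would receive. */
static unsigned char pick_counter_index(const struct mlx4_ib_counter *c)
{
	if (c->counter_index != -1)
		return (unsigned char)c->counter_index;
	return 0xff;          /* "no counter attached" sentinel */
}

int main(void)
{
	/* Invented example: port 1 got counter 3, port 2 got none. */
	struct mlx4_ib_counter counters[MLX4_MAX_PORTS] = {
		{ .counter_index = 3,  .status = 0 },
		{ .counter_index = -1, .status = -1 },
	};

	for (int p = 0; p < MLX4_MAX_PORTS; p++)
		printf("port %d -> qp context counter_index 0x%02x\n",
		       p + 1, pick_counter_index(&counters[p]));
	return 0;
}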
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
index 9ea4901..61c2088 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
@@ -35,11 +35,6 @@
#include <linux/module.h>
#include <linux/sched.h>
-#ifdef __linux__
-#include <linux/proc_fs.h>
-#include <linux/cred.h>
-#endif
-
#include "mlx4_ib.h"
static u32 convert_access(int acc)
@@ -48,9 +43,11 @@ static u32 convert_access(int acc)
(acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
(acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
(acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
+ (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) |
MLX4_PERM_LOCAL_READ;
}
-#ifdef __linux__
+/* No support for Shared MR feature */
+#if 0
static ssize_t shared_mr_proc_read(struct file *file,
char __user *buffer,
size_t len,
@@ -129,7 +126,7 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
return &mr->ibmr;
err_mr:
- mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
err_free:
kfree(mr);
@@ -159,7 +156,7 @@ static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
if (len & (mtt_size-1ULL)) {
WARN(1 ,
"write_block: len %llx is not aligned to mtt_size %llx\n",
- (long long)len, (long long)mtt_size);
+ (unsigned long long)len, (unsigned long long)mtt_size);
return -EINVAL;
}
@@ -203,8 +200,6 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
struct ib_umem *umem)
{
u64 *pages;
- struct ib_umem_chunk *chunk;
- int j;
u64 len = 0;
int err = 0;
u64 mtt_size;
@@ -212,6 +207,8 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
u64 mtt_shift;
int start_index = 0;
int npages = 0;
+ struct scatterlist *sg;
+ int i;
pages = (u64 *) __get_free_page(GFP_KERNEL);
if (!pages)
@@ -220,12 +217,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
mtt_shift = mtt->page_shift;
mtt_size = 1ULL << mtt_shift;
- list_for_each_entry(chunk, &umem->chunk_list, list)
- for (j = 0; j < chunk->nmap; ++j) {
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
if (cur_start_addr + len ==
- sg_dma_address(&chunk->page_list[j])) {
+ sg_dma_address(sg)) {
/* still the same block */
- len += sg_dma_len(&chunk->page_list[j]);
+ len += sg_dma_len(sg);
continue;
}
/* A new block is started ...*/
@@ -242,8 +238,8 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
goto out;
cur_start_addr =
- sg_dma_address(&chunk->page_list[j]);
- len = sg_dma_len(&chunk->page_list[j]);
+ sg_dma_address(sg);
+ len = sg_dma_len(sg);
}
/* Handle the last block */
@@ -319,8 +315,6 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
u64 start_va,
int *num_of_mtts)
{
- struct ib_umem_chunk *chunk;
- int j;
u64 block_shift = MLX4_MAX_MTT_SHIFT;
u64 current_block_len = 0;
u64 current_block_start = 0;
@@ -330,14 +324,18 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
u64 total_len = 0;
u64 last_block_aligned_end = 0;
u64 min_shift = ilog2(umem->page_size);
+ struct scatterlist *sg;
+ int i;
+ u64 next_block_start;
+ u64 current_block_end;
- list_for_each_entry(chunk, &umem->chunk_list, list) {
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
/* Initialization - save the first chunk start as
the current_block_start - block means contiguous pages.
*/
if (current_block_len == 0 && current_block_start == 0) {
first_block_start = current_block_start =
- sg_dma_address(&chunk->page_list[0]);
+ sg_dma_address(sg);
/* Find the bits that are different between
the physical address and the virtual
address for the start of the MR.
@@ -361,13 +359,12 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
, block_shift);
}
- /* Go over the scatter entries in the current chunk, check
+ /* Go over the scatter entries and check
if they continue the previous scatter entry.
*/
- for (j = 0; j < chunk->nmap; ++j) {
- u64 next_block_start =
- sg_dma_address(&chunk->page_list[j]);
- u64 current_block_end = current_block_start
+ next_block_start =
+ sg_dma_address(sg);
+ current_block_end = current_block_start
+ current_block_len;
/* If we have a split (non-contig.) between two block*/
if (current_block_end != next_block_start) {
@@ -392,7 +389,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
/* Start a new block */
current_block_start = next_block_start;
current_block_len =
- sg_dma_len(&chunk->page_list[j]);
+ sg_dma_len(sg);
continue;
}
/* The scatter entry is another part of
@@ -402,8 +399,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
which merge some blocks together.
*/
current_block_len +=
- sg_dma_len(&chunk->page_list[j]);
- }
+ sg_dma_len(sg);
}
/* Account for the last block in the total len */
@@ -416,7 +412,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
WARN((total_len & ((1ULL<<block_shift)-1ULL)),
" misaligned total length detected (%llu, %llu)!",
- (long long)total_len, (long long)block_shift);
+ (unsigned long long)total_len, (unsigned long long)block_shift);
*num_of_mtts = total_len >> block_shift;
end:
@@ -426,16 +422,19 @@ end:
*/
WARN(1,
"mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
- (long long)block_shift);
+ (unsigned long long)block_shift);
block_shift = min_shift;
}
return block_shift;
+
}
-#ifdef __linux__
+/* No support for Shared MR */
+#if 0
static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
{
+
struct proc_dir_entry *mr_proc_entry;
mode_t mode = S_IFREG;
char name_buff[16];
@@ -475,8 +474,51 @@ static int is_shared_mr(int access_flags)
IB_ACCESS_SHARED_MR_OTHER_WRITE));
}
+
+static void free_smr_info(struct mlx4_ib_mr *mr)
+{
+ /* When the master/parent shared MR is deregistered it can no
+ longer be shared - its mr_id will be returned to the kernel
+ as part of ib_uverbs_dereg_mr and may be allocated again by
+ another reg_mr.
+ */
+ char name_buff[16];
+
+ sprintf(name_buff, "%X", mr->smr_info->mr_id);
+ /* remove_proc_entry() checks internally that no operation
+ was started on that procfs file; if one is in progress the
+ current process will wait until it completes.
+ That's why no sync mechanism is needed when we release
+ the shared umem below.
+ */
+ remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
+ kfree(mr->smr_info);
+ mr->smr_info = NULL;
+}
#endif
+static void mlx4_invalidate_umem(void *invalidation_cookie,
+ struct ib_umem *umem,
+ unsigned long addr, size_t size)
+{
+ struct mlx4_ib_mr *mr = (struct mlx4_ib_mr *)invalidation_cookie;
+
+ /* This function is called under client peer lock so its resources are race protected */
+ if (atomic_inc_return(&mr->invalidated) > 1) {
+ umem->invalidation_ctx->inflight_invalidation = 1;
+ goto end;
+ }
+
+ umem->invalidation_ctx->peer_callback = 1;
+ mlx4_mr_free(to_mdev(mr->ibmr.device)->dev, &mr->mmr);
+ ib_umem_release(umem);
+ complete(&mr->invalidation_comp);
+
+end:
+ return;
+
+}
+
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata,
@@ -487,18 +529,20 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
int shift;
int err;
int n;
+ struct ib_peer_memory_client *ib_peer_mem;
mr = kzalloc(sizeof *mr, GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
- mr->umem = ib_umem_get(pd->uobject->context, start, length,
- access_flags, 0);
+ mr->umem = ib_umem_get_ex(pd->uobject->context, start, length,
+ access_flags, 0, 1);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
goto err_free;
}
+ ib_peer_mem = mr->umem->ib_peer_mem;
n = ib_umem_page_count(mr->umem);
shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
&n);
@@ -516,7 +560,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
goto err_mr;
mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
-#ifdef __linux__
+/* No support for Shared MR */
+#if 0
/* Check whether MR should be shared */
if (is_shared_mr(access_flags)) {
/* start address and length must be aligned to page size in order
@@ -531,10 +576,32 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
goto err_mr;
}
#endif
+ if (ib_peer_mem) {
+ if (access_flags & IB_ACCESS_MW_BIND) {
+ /* Prevent binding MW on peer clients.
+ * mlx4_invalidate_umem must be void,
+ * therefore, mlx4_mr_free should not fail
+ * when using peer clients. */
+ err = -ENOSYS;
+ pr_err("MW is not supported with peer memory client");
+ goto err_smr;
+ }
+ init_completion(&mr->invalidation_comp);
+ ib_umem_activate_invalidation_notifier(mr->umem,
+ mlx4_invalidate_umem, mr);
+ }
+
+ atomic_set(&mr->invalidated, 0);
return &mr->ibmr;
+err_smr:
+/* No support for Shared MR */
+#if 0
+ if (mr->smr_info)
+ free_smr_info(mr);
+#endif
err_mr:
- mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
err_umem:
ib_umem_release(mr->umem);
@@ -545,41 +612,106 @@ err_free:
return ERR_PTR(err);
}
-
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx4_ib_mr *mr = to_mmr(ibmr);
+ struct ib_umem *umem = mr->umem;
+ int ret;
- mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
- if (mr->smr_info) {
- /* When master/parent shared mr is dereged there is
- no ability to share this mr any more - its mr_id will be
- returned to the kernel as part of ib_uverbs_dereg_mr
- and may be allocated again as part of other reg_mr.
- */
- char name_buff[16];
-
- sprintf(name_buff, "%X", mr->smr_info->mr_id);
- /* Remove proc entry is checking internally that no operation
- was strated on that proc fs file and if in the middle
- current process will wait till end of operation.
- That's why no sync mechanism is needed when we release
- below the shared umem.
- */
-#ifdef __linux__
- remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
- kfree(mr->smr_info);
+/* No support for Shared MR */
+#if 0
+ if (mr->smr_info)
+ free_smr_info(mr);
#endif
+
+ if (atomic_inc_return(&mr->invalidated) > 1) {
+ wait_for_completion(&mr->invalidation_comp);
+ goto end;
+ }
+
+ ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (ret) {
+ /* Error is not expected here, except when memory windows
+ * are bound to MR which is not supported with
+ * peer memory clients */
+ atomic_set(&mr->invalidated, 0);
+ return ret;
}
- if (mr->umem)
+ if (!umem)
+ goto end;
+
ib_umem_release(mr->umem);
+end:
kfree(mr);
return 0;
}
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mw *mw;
+ int err;
+
+ mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mw_enable(dev->dev, &mw->mmw);
+ if (err)
+ goto err_mw;
+
+ mw->ibmw.rkey = mw->mmw.key;
+
+ return &mw->ibmw;
+
+err_mw:
+ mlx4_mw_free(dev->dev, &mw->mmw);
+
+err_free:
+ kfree(mw);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ struct ib_send_wr wr;
+ struct ib_send_wr *bad_wr;
+ int ret;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.opcode = IB_WR_BIND_MW;
+ wr.wr_id = mw_bind->wr_id;
+ wr.send_flags = mw_bind->send_flags;
+ wr.wr.bind_mw.mw = mw;
+ wr.wr.bind_mw.bind_info = mw_bind->bind_info;
+ wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey);
+
+ ret = mlx4_ib_post_send(qp, &wr, &bad_wr);
+ if (!ret)
+ mw->rkey = wr.wr.bind_mw.rkey;
+
+ return ret;
+}
+
+int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
+{
+ struct mlx4_ib_mw *mw = to_mmw(ibmw);
+
+ mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
+ kfree(mw);
+
+ return 0;
+}
+
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
int max_page_list_len)
{
@@ -606,7 +738,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
return &mr->ibmr;
err_mr:
- mlx4_mr_free(dev->dev, &mr->mmr);
+ (void) mlx4_mr_free(dev->dev, &mr->mmr);
err_free:
kfree(mr);
@@ -685,7 +817,7 @@ struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
return &fmr->ibfmr;
err_mr:
- mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
err_free:
kfree(fmr);
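mr.c above lets two paths tear a peer-memory MR down - the mlx4_invalidate_umem() callback and mlx4_ib_dereg_mr() - and serializes them with an atomic counter plus a completion: whichever path increments mr->invalidated first performs the release, the second one either defers or waits on the completion. The following user-space model is only a sketch of that first-wins protocol under invented names, not the driver code, and it is single-threaded so the "wait" degenerates to a check.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the two-party teardown for peer-memory MRs. */
struct fake_mr {
	int invalidated;   /* models atomic_t mr->invalidated        */
	bool freed;        /* models mlx4_mr_free + ib_umem_release  */
	bool completed;    /* models complete(&invalidation_comp)    */
};

/* Whoever bumps 'invalidated' from 0 to 1 owns the release. */
static bool claim_teardown(struct fake_mr *mr)
{
	return ++mr->invalidated == 1;   /* the atomic_inc_return(...) > 1 test */
}

static void invalidation_callback(struct fake_mr *mr)
{
	if (!claim_teardown(mr))
		return;                  /* dereg already started, just flag it */
	mr->freed = true;
	mr->completed = true;            /* wake a waiting dereg, if any */
}

static void dereg_mr(struct fake_mr *mr)
{
	if (!claim_teardown(mr)) {
		/* callback got there first: stands in for wait_for_completion() */
		if (!mr->completed)
			printf("would block until invalidation finishes\n");
		return;
	}
	mr->freed = true;
}

int main(void)
{
	struct fake_mr mr = { 0 };

	invalidation_callback(&mr);      /* first party does the release */
	dereg_mr(&mr);                   /* second party only waits      */
	printf("freed=%d invalidated=%d\n", (int)mr.freed, mr.invalidated);
	return 0;
}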
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
index c5ebe6b..b3d9695 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
@@ -45,13 +45,11 @@
#include <linux/mlx4/driver.h>
#include <linux/io.h>
-#ifndef __linux__
-#define asm __asm
-#endif
-
#include "mlx4_ib.h"
#include "user.h"
+#define asm __asm
+
enum {
MLX4_IB_ACK_REQ_FREQ = 8,
};
@@ -111,6 +109,8 @@ static const __be32 mlx4_ib_opcode[] = {
[IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
[IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+ [IB_WR_BIND_MW] = cpu_to_be32(
+ MLX4_OPCODE_BIND_MW),
};
#ifndef wc_wmb
@@ -263,7 +263,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
/* Pad the remainder of the WQE with an inline data segment. */
if (size > s) {
inl = wqe + s;
- inl->byte_count = cpu_to_be32(1U << 31 | (size - s - sizeof *inl));
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
}
ctrl->srcrb_flags = 0;
ctrl->fence_size = size / 16;
@@ -274,7 +274,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
wmb();
ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
- (n & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0);
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
}
@@ -573,6 +573,12 @@ static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
sizeof (struct mlx4_ib_proxy_sqp_hdr),
DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(dev,
+ qp->sqp_proxy_rcv[i].map))) {
+ pr_warn("ib_dma_map_single failed\n");
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ goto err;
+ }
}
return 0;
@@ -602,15 +608,6 @@ static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
kfree(qp->sqp_proxy_rcv);
}
-static int qp_has_rq(struct ib_qp_init_attr *attr)
-{
- if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
- return 0;
-
- return !attr->srq;
-}
-
-#ifdef __linux__
static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
struct ib_qp_init_attr *attr, int *qpn)
{
@@ -644,7 +641,7 @@ static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base);
else
err = mlx4_qp_reserve_range(dev->dev, tss_align_num,
- tss_align_num, &tss_base, 1);
+ tss_align_num, &tss_base, MLX4_RESERVE_BF_QP);
if (err)
goto err1;
@@ -791,7 +788,6 @@ static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn)
break;
}
}
-#endif
static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
struct ib_qp_init_attr *attr, int *qpn)
@@ -800,10 +796,12 @@ static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
switch (attr->qpg_type) {
case IB_QPG_NONE:
- /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
- * BlueFlame setup flow wrongly causes VLAN insertion. */
+ /* Raw packet QPNs may not have bits 6,7 set in their qp_num;
+ * otherwise, the WQE BlueFlame setup flow wrongly causes
+ * VLAN insertion. */
if (attr->qp_type == IB_QPT_RAW_PACKET) {
- err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 1);
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn,
+ MLX4_RESERVE_BF_QP);
} else {
if(qp->flags & MLX4_IB_QP_NETIF)
err = mlx4_ib_steer_qp_alloc(dev, 1, qpn);
@@ -812,15 +810,11 @@ static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
}
break;
case IB_QPG_PARENT:
-#ifdef __linux__
err = init_qpg_parent(dev, qp, attr, qpn);
-#endif
break;
case IB_QPG_CHILD_TX:
case IB_QPG_CHILD_RX:
-#ifdef __linux__
err = alloc_qpg_qpn(attr, qp, qpn);
-#endif
break;
default:
qp->qpg_type = IB_QPG_NONE;
@@ -844,15 +838,11 @@ static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
mlx4_qp_release_range(dev->dev, qpn, 1);
break;
case IB_QPG_PARENT:
-#ifdef __linux__
free_qpg_parent(dev, qp);
-#endif
break;
case IB_QPG_CHILD_TX:
case IB_QPG_CHILD_RX:
-#ifdef __linux__
free_qpg_qpn(qp, qpn);
-#endif
break;
default:
break;
@@ -881,10 +871,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
struct mlx4_ib_qp *qp;
enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
-#ifndef __linux__
- init_attr->qpg_type = IB_QPG_NONE;
-#endif
-
/* When tunneling special qps, we use a plain UD qp */
if (sqpn) {
if (mlx4_is_mfunc(dev->dev) &&
@@ -941,6 +927,23 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->mlx4_ib_qp_type = qp_type;
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+ qp->flags |= MLX4_IB_QP_LSO;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (dev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED &&
+ !mlx4_is_mfunc(dev->dev))
+ qp->flags |= MLX4_IB_QP_NETIF;
+ else {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
spin_lock_init(&qp->rq.lock);
@@ -952,7 +955,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, mlx4_ib_qp_has_rq(init_attr), qp);
if (err)
goto err;
@@ -961,11 +964,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
int shift;
int n;
- if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ if (!udata || ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
err = -EFAULT;
goto err;
}
+ if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL)
+ qp->flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL;
+
+ if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND)
+ qp->flags |= MLX4_IB_QP_CAP_MANAGED_SEND;
+
+ if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
+ qp->flags |= MLX4_IB_QP_CAP_MANAGED_RECV;
+
qp->sq_no_prefetch = ucmd.sq_no_prefetch;
err = set_user_sq_size(dev, qp, &ucmd);
@@ -990,7 +1002,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
if (err)
goto err_mtt;
- if (qp_has_rq(init_attr)) {
+ if (mlx4_ib_qp_has_rq(init_attr)) {
err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
ucmd.db_addr, &qp->db);
if (err)
@@ -999,23 +1011,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
} else {
qp->sq_no_prefetch = 0;
- if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
- qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
-
- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
- qp->flags |= MLX4_IB_QP_LSO;
-
- if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP &&
- dev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED &&
- !mlx4_is_mfunc(dev->dev))
- qp->flags |= MLX4_IB_QP_NETIF;
-
err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
if (err)
goto err;
- if (qp_has_rq(init_attr)) {
+ if (mlx4_ib_qp_has_rq(init_attr)) {
err = mlx4_db_alloc(dev->dev, &qp->db, 0);
if (err)
goto err;
@@ -1097,7 +1097,7 @@ err_proxy:
free_proxy_bufs(pd->device, qp);
err_wrid:
if (pd->uobject) {
- if (qp_has_rq(init_attr))
+ if (mlx4_ib_qp_has_rq(init_attr))
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
} else {
kfree(qp->sq.wrid);
@@ -1114,7 +1114,7 @@ err_buf:
mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
err_db:
- if (!pd->uobject && qp_has_rq(init_attr))
+ if (!pd->uobject && mlx4_ib_qp_has_rq(init_attr))
mlx4_db_free(dev->dev, &qp->db);
if (qp->max_inline_data)
@@ -1145,7 +1145,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv
{
if (send_cq == recv_cq) {
spin_lock_irq(&send_cq->lock);
- (void) __acquire(&recv_cq->lock);
+ __acquire(&recv_cq->lock);
} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_lock_irq(&send_cq->lock);
spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
@@ -1159,7 +1159,7 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re
__releases(&send_cq->lock) __releases(&recv_cq->lock)
{
if (send_cq == recv_cq) {
- (void) __release(&recv_cq->lock);
+ __release(&recv_cq->lock);
spin_unlock_irq(&send_cq->lock);
} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_unlock(&recv_cq->lock);
@@ -1300,14 +1300,14 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
return dev->dev->caps.qp1_proxy[attr->port_num - 1];
}
-#ifdef __linux__
static int check_qpg_attr(struct mlx4_ib_dev *dev,
struct ib_qp_init_attr *attr)
{
if (attr->qpg_type == IB_QPG_NONE)
return 0;
- if (attr->qp_type != IB_QPT_UD)
+ if (attr->qp_type != IB_QPT_UD &&
+ attr->qp_type != IB_QPT_RAW_PACKET)
return -EINVAL;
if (attr->qpg_type == IB_QPG_PARENT) {
@@ -1346,7 +1346,6 @@ static int check_qpg_attr(struct mlx4_ib_dev *dev,
}
return 0;
}
-#endif
#define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END) \
& ~(IB_QP_CREATE_RESERVED_START - 1))
@@ -1364,6 +1363,15 @@ static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_f
if (ib_qp_flags & IB_QP_CREATE_NETIF_QP)
mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF;
+ if (ib_qp_flags & IB_QP_CREATE_CROSS_CHANNEL)
+ mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL;
+
+ if (ib_qp_flags & IB_QP_CREATE_MANAGED_SEND)
+ mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_SEND;
+
+ if (ib_qp_flags & IB_QP_CREATE_MANAGED_RECV)
+ mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_RECV;
+
/* reserved flags */
mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK);
@@ -1387,6 +1395,9 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
* and only for kernel UD QPs.
*/
if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO |
+ MLX4_IB_QP_CAP_CROSS_CHANNEL |
+ MLX4_IB_QP_CAP_MANAGED_SEND |
+ MLX4_IB_QP_CAP_MANAGED_RECV |
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP |
MLX4_IB_QP_NETIF))
@@ -1397,19 +1408,30 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
return ERR_PTR(-EINVAL);
}
- if (init_attr->create_flags &&
- (udata ||
- ((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) &&
+ if ((mlx4_qp_flags &
+ (MLX4_IB_QP_CAP_CROSS_CHANNEL |
+ MLX4_IB_QP_CAP_MANAGED_SEND |
+ MLX4_IB_QP_CAP_MANAGED_RECV)) &&
+ !(to_mdev(device)->dev->caps.flags &
+ MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)) {
+ pr_debug("%s Does not support cross-channel operations\n",
+ to_mdev(device)->ib_dev.name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if ((init_attr->create_flags &
+ ~(IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV)) &&
+ (((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) &&
init_attr->qp_type != IB_QPT_UD) ||
((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) &&
init_attr->qp_type > IB_QPT_GSI)))
return ERR_PTR(-EINVAL);
-#ifdef __linux__
err = check_qpg_attr(to_mdev(device), init_attr);
if (err)
return ERR_PTR(err);
-#endif
switch (init_attr->qp_type) {
case IB_QPT_XRC_TGT:
@@ -1559,32 +1581,42 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
}
+static int ib_rate_to_mlx4(struct mlx4_ib_dev *dev, u8 rate)
+{
+ if (rate == IB_RATE_PORT_CURRENT) {
+ return 0;
+ } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
+ return -EINVAL;
+ } else {
+ while (rate != IB_RATE_2_5_GBPS &&
+ !(1 << (rate + MLX4_STAT_RATE_OFFSET) &
+ dev->dev->caps.stat_rate_support))
+ --rate;
+ }
+
+ return rate + MLX4_STAT_RATE_OFFSET;
+}
+
static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
- struct mlx4_ib_qp *qp, struct mlx4_qp_path *path,
- u8 port, int is_primary)
+ u8 *smac, u16 vlan_id, struct mlx4_ib_qp *qp,
+ struct mlx4_qp_path *path, u8 port, int is_primary)
{
- struct net_device *ndev;
- int err;
int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
IB_LINK_LAYER_ETHERNET;
- u8 mac[6];
- int is_mcast;
u16 vlan_tag;
int vidx;
int smac_index;
+ int err;
u64 u64_mac;
- u8 *smac;
struct mlx4_roce_smac_vlan_info *smac_info;
path->grh_mylmc = ah->src_path_bits & 0x7f;
path->rlid = cpu_to_be16(ah->dlid);
- if (ah->static_rate) {
- path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
- while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
- !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
- --path->static_rate;
- } else
- path->static_rate = 0;
+
+ err = ib_rate_to_mlx4(dev, ah->static_rate);
+ if (err < 0)
+ return err;
+ path->static_rate = err;
if (ah->ah_flags & IB_AH_GRH) {
if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
@@ -1614,7 +1646,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
else
smac_info = &qp->alt;
- vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
+ vlan_tag = vlan_id;
if (vlan_tag < 0x1000) {
if (smac_info->vid < 0x1000) {
/* both valid vlan ids */
@@ -1653,28 +1685,13 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
}
}
- err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
- if (err)
- return err;
/* get smac_index for RoCE use.
* If no smac was yet assigned, register one.
* If one was already assigned, but the new mac differs,
* unregister the old one and register the new one.
*/
- spin_lock(&dev->iboe.lock);
- ndev = dev->iboe.netdevs[port - 1];
- if (ndev) {
-#ifdef __linux__
- smac = ndev->dev_addr; /* fixme: cache this value */
-#else
- smac = IF_LLADDR(ndev); /* fixme: cache this value */
-#endif
-
u64_mac = mlx4_mac_to_u64(smac);
- } else
- u64_mac = dev->dev->caps.def_mac[port];
- spin_unlock(&dev->iboe.lock);
if (!smac_info->smac || smac_info->smac != u64_mac) {
/* register candidate now, unreg if needed, after success */
@@ -1688,7 +1705,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
} else
smac_index = smac_info->smac_index;
- memcpy(path->dmac, mac, 6);
+ memcpy(path->dmac, ah->dmac, 6);
path->ackto = MLX4_IB_LINK_TYPE_ETH;
/* put MAC table smac index for IBoE */
path->grh_mylmc = (u8) (smac_index) | 0x80 ;
@@ -1712,24 +1729,21 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
}
}
-static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, const u8 *smac,
struct mlx4_qp_context *context)
{
struct net_device *ndev;
u64 u64_mac;
- u8 *smac;
int smac_index;
+
ndev = dev->iboe.netdevs[qp->port - 1];
if (ndev) {
-#ifdef __linux__
- smac = ndev->dev_addr; /* fixme: cache this value */
-#else
- smac = IF_LLADDR(ndev); /* fixme: cache this value */
-#endif
+ smac = IF_LLADDR(ndev);
u64_mac = mlx4_mac_to_u64(smac);
- } else
+ } else {
u64_mac = dev->dev->caps.def_mac[qp->port];
+ }
context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
if (!qp->pri.smac) {
@@ -1783,6 +1797,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
}
}
+ if (qp->max_inlr_data)
+ context->param3 |= cpu_to_be32(1 << 25);
+
if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
@@ -1834,12 +1851,13 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
}
if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
- if (dev->counters[qp->port - 1] != -1) {
+ if (dev->counters[qp->port - 1].counter_index != -1) {
context->pri_path.counter_index =
- dev->counters[qp->port - 1];
+ dev->counters[qp->port - 1].counter_index;
optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
- } else
+ } else {
context->pri_path.counter_index = 0xff;
+ }
if (qp->flags & MLX4_IB_QP_NETIF &&
(qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) {
@@ -1855,8 +1873,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
}
- if (attr_mask & IB_QP_AV) {
- if (mlx4_set_path(dev, &attr->ah_attr, qp, &context->pri_path,
+ if ((attr_mask & IB_QP_AV) && (ibqp->qp_type != IB_QPT_RAW_PACKET)) {
+ if (mlx4_set_path(dev, &attr->ah_attr, (u8 *)attr->smac,
+ attr_mask & IB_QP_VID ?
+ attr->vlan_id : 0xffff,
+ qp, &context->pri_path,
attr_mask & IB_QP_PORT ?
attr->port_num : qp->port, 1))
goto out;
@@ -1879,12 +1900,16 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
dev->dev->caps.pkey_table_len[attr->alt_port_num])
goto out;
- if (mlx4_set_path(dev, &attr->alt_ah_attr, qp, &context->alt_path,
+ if (mlx4_set_path(dev, &attr->alt_ah_attr, (u8 *)attr->smac,
+ attr_mask & IB_QP_ALT_VID ?
+ attr->alt_vlan_id : 0xffff,
+ qp, &context->alt_path,
attr->alt_port_num, 0))
goto out;
context->alt_path.pkey_index = attr->alt_pkey_index;
context->alt_path.ackto = attr->alt_timeout << 3;
+ context->alt_path.counter_index = dev->counters[attr->alt_port_num - 1].counter_index;
optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
}
@@ -1943,6 +1968,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
if (attr_mask & IB_M_EXT_CLASS_3)
context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ);
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ context->params2 |= (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL ?
+ cpu_to_be32(MLX4_QP_BIT_COLL_MASTER) : 0);
+ context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND ?
+ cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_SQ) : 0);
+ context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV ?
+ cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_RQ) : 0);
+ }
+
if (ibqp->srq)
context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
@@ -1997,6 +2031,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
context->pri_path.fl = 0x80;
context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
}
+ if (ibqp->qp_type == IB_QPT_RAW_PACKET &&
+ (attr_mask & IB_QP_AV)) {
+ context->pri_path.sched_queue |=
+ ((attr->ah_attr.sl & 0xf) << 3);
+ context->pri_path.feup = 1 << 6;
+ }
is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
IB_LINK_LAYER_ETHERNET;
if (is_eth) {
@@ -2007,13 +2047,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
- err = handle_eth_ud_smac_index(dev, qp, context);
+ err = handle_eth_ud_smac_index(dev, qp, (const u8 *)attr->smac, context);
if (err)
return -EINVAL;
}
}
}
+ if (ibqp->qp_type == IB_QPT_UD)
+ if (is_eth && (new_state == IB_QPS_RTR)) {
+ context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
+ optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
+ }
+
if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
sqd_event = 1;
@@ -2072,7 +2118,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
for (i = 0; i < qp->sq.wqe_cnt; ++i) {
ctrl = get_send_wqe(qp, i);
- ctrl->owner_opcode = cpu_to_be32(1U << 31);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
if (qp->sq_max_wqes_per_wr == 1)
ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
@@ -2080,6 +2126,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
}
}
+ if ((qp->port && rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET) && (qp->ibqp.qp_type == IB_QPT_RAW_PACKET))
+ context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+ MLX4_IB_LINK_TYPE_ETH;
+
err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
to_mlx4_state(new_state), context, optpar,
sqd_event, &qp->mqp);
@@ -2268,14 +2319,22 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
struct mlx4_ib_qp *qp = to_mqp(ibqp);
enum ib_qp_state cur_state, new_state;
int err = -EINVAL;
+ int ll;
mutex_lock(&qp->mutex);
cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ ll = IB_LINK_LAYER_UNSPECIFIED;
+ } else {
+ int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ ll = rdma_port_get_link_layer(&dev->ib_dev, port);
+ }
+
if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
- attr_mask & ~IB_M_QP_MOD_VEND_MASK)) {
+ attr_mask & ~IB_M_QP_MOD_VEND_MASK, ll)) {
pr_debug("qpn 0x%x: invalid attribute mask specified "
"for transition %d to %d. qp_type %d,"
" attr_mask 0x%x\n",
@@ -2299,11 +2358,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
goto out;
}
- if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
- (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
- IB_LINK_LAYER_ETHERNET))
- goto out;
-
if (attr_mask & IB_QP_PKEY_INDEX) {
int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
@@ -2421,11 +2475,11 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (header_size <= spc) {
- inl->byte_count = cpu_to_be32(1U << 31 | header_size);
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
memcpy(inl + 1, sqp->header_buf, header_size);
i = 1;
} else {
- inl->byte_count = cpu_to_be32(1U << 31 | spc);
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
memcpy(inl + 1, sqp->header_buf, spc);
inl = (void *) (inl + 1) + spc;
@@ -2444,7 +2498,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
* of 16 mod 64.
*/
wmb();
- inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
i = 2;
}
@@ -2470,7 +2524,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
int is_eth;
int is_vlan = 0;
int is_grh;
- u16 vlan = 0;
+ u16 uninitialized_var(vlan);
int err = 0;
send_size = 0;
@@ -2497,8 +2551,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
return err;
}
- vlan = rdma_get_vlan_id(&sgid);
- is_vlan = vlan < 0x1000;
+ if (is_eth && ah->av.eth.vlan != 0xffff) {
+ vlan = cpu_to_be16(ah->av.eth.vlan) & 0x0fff;
+ is_vlan = 1;
+ }
}
ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
@@ -2565,7 +2621,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
}
if (is_eth) {
- u8 smac[6];
+ u8 *smac;
struct in6_addr in6;
u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
@@ -2577,8 +2633,13 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
memcpy(&in6, sgid.raw, sizeof(in6));
- rdma_get_ll_mac(&in6, smac);
+
+ if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev))
+ smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]);
+ else
+ smac = ah->av.eth.s_mac; /* use the src mac of the tunnel */
memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+
if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
if (!is_vlan) {
@@ -2628,11 +2689,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (header_size <= spc) {
- inl->byte_count = cpu_to_be32(1U << 31 | header_size);
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
memcpy(inl + 1, sqp->header_buf, header_size);
i = 1;
} else {
- inl->byte_count = cpu_to_be32(1U << 31 | spc);
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
memcpy(inl + 1, sqp->header_buf, spc);
inl = (void *) (inl + 1) + spc;
@@ -2651,7 +2712,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
* of 16 mod 64.
*/
wmb();
- inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
i = 2;
}
@@ -2679,9 +2740,12 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq
static __be32 convert_access(int acc)
{
- return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) |
- (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
- (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) |
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) |
(acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) |
cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
}
@@ -2707,6 +2771,24 @@ static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
fseg->reserved[1] = 0;
}
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr)
+{
+ bseg->flags1 =
+ convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) &
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ |
+ MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE |
+ MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC);
+ bseg->flags2 = 0;
+ if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2)
+ bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2);
+ if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED)
+ bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED);
+ bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey);
+ bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey);
+ bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr);
+ bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length);
+}
+
static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
{
iseg->mem_key = cpu_to_be32(rkey);
@@ -2792,23 +2874,25 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_
hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+ memcpy(hdr.mac, ah->av.eth.mac, 6);
+ hdr.vlan = cpu_to_be16(ah->av.eth.vlan);
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (sizeof (hdr) <= spc) {
memcpy(inl + 1, &hdr, sizeof (hdr));
wmb();
- inl->byte_count = cpu_to_be32(1U << 31 | sizeof (hdr));
+ inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
i = 1;
} else {
memcpy(inl + 1, &hdr, spc);
wmb();
- inl->byte_count = cpu_to_be32(1U << 31 | spc);
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
inl = (void *) (inl + 1) + spc;
memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
wmb();
- inl->byte_count = cpu_to_be32(1U << 31 | (sizeof (hdr) - spc));
+ inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
i = 2;
}
@@ -2833,7 +2917,7 @@ static void set_mlx_icrc_seg(void *dseg)
*/
wmb();
- iseg->byte_count = cpu_to_be32((1U << 31) | 4);
+ iseg->byte_count = cpu_to_be32((1 << 31) | 4);
}
static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
@@ -2901,7 +2985,7 @@ static void add_zero_len_inline(void *wqe)
{
struct mlx4_wqe_inline_seg *inl = wqe;
memset(wqe, 0, 16);
- inl->byte_count = cpu_to_be32(1U << 31);
+ inl->byte_count = cpu_to_be32(1 << 31);
}
static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
@@ -3102,6 +3186,12 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
break;
+ case IB_WR_BIND_MW:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_bind_seg);
+ size += sizeof(struct mlx4_wqe_bind_seg) / 16;
default:
/* No extra segments required for sends */
break;
@@ -3246,14 +3336,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
*/
wmb();
- if (wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
*bad_wr = wr;
err = -EINVAL;
goto out;
}
ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
- (ind & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0) | blh;
+ (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
stamp = ind + qp->sq_spare_wqes;
ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
@@ -3576,6 +3666,15 @@ done:
qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ if (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL)
+ qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL;
+
+ if (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND)
+ qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND;
+
+ if (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV)
+ qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
+
qp_init_attr->qpg_type = ibqp->qpg_type;
if (ibqp->qpg_type == IB_QPG_PARENT)
qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz;
@@ -3586,4 +3685,3 @@ out:
mutex_unlock(&qp->mutex);
return err;
}
-
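
Illustration only, not part of the patch: the byte_count writes seen above in build_tunnel_header() and add_zero_len_inline() set bit 31 to mark the WQE segment as inline data, with the low bits carrying the byte length. The small userspace sketch below shows that encoding; cpu_to_be32() is emulated with htonl(), and the helper name is invented for the example.

/*
 * Illustration only: encode an inline-segment byte_count the way the
 * hunks above do.  Bit 31 flags the segment as inline; the low bits
 * hold the length.  An unsigned literal keeps the shift out of the
 * sign bit.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static uint32_t inline_byte_count(uint32_t len)
{
	return htonl((1U << 31) | len);	/* stand-in for cpu_to_be32() */
}

int main(void)
{
	printf("48-byte header -> byte_count 0x%08x (big endian)\n",
	    inline_byte_count(48));
	return 0;
}
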
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
index 6837b86..df4549f 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
@@ -56,8 +56,8 @@ static ssize_t show_admin_alias_guid(struct device *dev,
record_num = mlx4_ib_iov_dentry->entry_num / 8 ;
guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ;
- return sprintf(buf, "%llx\n", (long long)
- be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
+ return sprintf(buf, "%llx\n",
+ (long long)be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
ports_guid[port->num - 1].
all_rec_per_port[record_num].
all_recs[8 * guid_index_in_rec]));
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
index 088e440..3fed07c 100644
--- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -672,8 +672,8 @@ static int mthca_destroy_qp(struct ib_qp *qp)
return 0;
}
-static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
- int comp_vector,
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
+ struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
struct ib_udata *udata)
{
@@ -681,6 +681,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
struct mthca_cq *cq;
int nent;
int err;
+ int entries = attr->cqe;
if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
return ERR_PTR(-EINVAL);
@@ -1010,12 +1011,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata, int mr_id)
{
struct mthca_dev *dev = to_mdev(pd->device);
- struct ib_umem_chunk *chunk;
+ struct scatterlist *sg;
struct mthca_mr *mr;
struct mthca_reg_mr ucmd;
u64 *pages;
int shift, n, len;
- int i, j, k;
+ int i, k, entry;
int err = 0;
int write_mtt_size;
@@ -1044,10 +1045,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
shift = ffs(mr->umem->page_size) - 1;
- n = 0;
- list_for_each_entry(chunk, &mr->umem->chunk_list, list)
- n += chunk->nents;
-
+ n = mr->umem->nmap;
mr->mtt = mthca_alloc_mtt(dev, n);
if (IS_ERR(mr->mtt)) {
err = PTR_ERR(mr->mtt);
@@ -1064,25 +1062,25 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
- list_for_each_entry(chunk, &mr->umem->chunk_list, list)
- for (j = 0; j < chunk->nmap; ++j) {
- len = sg_dma_len(&chunk->page_list[j]) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = sg_dma_address(&chunk->page_list[j]) +
- mr->umem->page_size * k;
- /*
- * Be friendly to write_mtt and pass it chunks
- * of appropriate size.
- */
- if (i == write_mtt_size) {
- err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
- if (err)
- goto mtt_done;
- n += i;
- i = 0;
- }
+ for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(sg) +
+ mr->umem->page_size * k;
+ /*
+ * Be friendly to write_mtt and pass it chunks
+ * of appropriate size.
+ */
+ if (i == write_mtt_size) {
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+ if (err)
+ goto mtt_done;
+ n += i;
+ i = 0;
}
}
+ }
+
if (i)
err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
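
Illustration only, not part of the patch: the hunk above converts the mthca user-MR path from the old ib_umem chunk list to the scatterlist layout now exposed by ib_umem, walking umem->sg_head.sgl for umem->nmap entries and flushing page addresses to the MTT in write_mtt_size batches. The sketch below mirrors that loop in plain C; struct dma_seg and flush_batch() are stand-ins invented for this illustration, not kernel APIs.

/*
 * Illustration only: split DMA segments into page addresses and flush
 * them in fixed-size batches, as the rewritten loop above does with
 * for_each_sg() and mthca_write_mtt().
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct dma_seg { uint64_t dma_addr; size_t dma_len; };

static void flush_batch(const uint64_t *pages, int start, int count)
{
	printf("write_mtt: %d pages starting at MTT index %d\n", count, start);
}

static void map_umem(const struct dma_seg *sg, int nmap,
    size_t page_size, int write_mtt_size)
{
	uint64_t pages[8];
	int n = 0, i = 0;

	for (int entry = 0; entry < nmap; ++entry) {
		size_t len = sg[entry].dma_len / page_size;
		for (size_t k = 0; k < len; ++k) {
			pages[i++] = sg[entry].dma_addr + page_size * k;
			/* Pass write_mtt-sized batches, as the driver does. */
			if (i == write_mtt_size) {
				flush_batch(pages, n, i);
				n += i;
				i = 0;
			}
		}
	}
	if (i)
		flush_batch(pages, n, i);
}

int main(void)
{
	struct dma_seg sg[] = {
		{ 0x100000, 4 * 4096 },
		{ 0x200000, 3 * 4096 },
	};

	map_umem(sg, 2, 4096, 4);
	return 0;
}
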
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
index 2264bcd..b4c70b4 100644
--- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -870,7 +870,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
- if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
mthca_dbg(dev, "Bad QP transition (transport %d) "
"%d->%d with attr 0x%08x\n",
qp->transport, cur_state, new_state,
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
deleted file mode 100644
index 3090100..0000000
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o
-
-ib_ipoib-y := ipoib_main.o \
- ipoib_ib.o \
- ipoib_multicast.o \
- ipoib_verbs.o \
- ipoib_vlan.o \
- ipoib_ethtool.o
-ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
-ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
-
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
index 7d5e175..eb269a4 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -80,6 +80,7 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
+#include <linux/rbtree.h>
#include <asm/atomic.h>
@@ -313,6 +314,7 @@ struct ipoib_ethtool_st {
*/
struct ipoib_dev_priv {
spinlock_t lock;
+ spinlock_t drain_lock;
struct ifnet *dev;
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 4fb39b4..814938c 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -383,6 +383,7 @@ ipoib_poll(struct ipoib_dev_priv *priv)
int n, i;
poll_more:
+ spin_lock(&priv->drain_lock);
for (;;) {
n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
@@ -401,6 +402,7 @@ poll_more:
if (n != IPOIB_NUM_WC)
break;
}
+ spin_unlock(&priv->drain_lock);
if (ib_req_notify_cq(priv->recv_cq,
IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS))
@@ -707,6 +709,7 @@ void ipoib_drain_cq(struct ipoib_dev_priv *priv)
{
int i, n;
+ spin_lock(&priv->drain_lock);
do {
n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
for (i = 0; i < n; ++i) {
@@ -727,6 +730,7 @@ void ipoib_drain_cq(struct ipoib_dev_priv *priv)
ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
}
} while (n == IPOIB_NUM_WC);
+ spin_unlock(&priv->drain_lock);
spin_lock(&priv->lock);
while (ipoib_poll_tx(priv))
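
Illustration only, not part of the patch: the new drain_lock serializes the two consumers of the receive CQ, the ipoib_poll() fast path and ipoib_drain_cq() during teardown, both of which call ib_poll_cq() on priv->recv_cq. The sketch below shows the shape of that locking; a pthread mutex stands in for the kernel spinlock and poll_cq_once() is a made-up placeholder.

/*
 * Illustration only: two code paths poll the same receive CQ, and
 * drain_lock ensures they never run concurrently.  poll_cq_once() is a
 * made-up placeholder for ib_poll_cq() plus completion handling.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;

static int poll_cq_once(const char *who)
{
	printf("%s: polling receive CQ\n", who);
	return 0;			/* pretend the CQ is empty */
}

static void rx_poll_path(void)		/* cf. ipoib_poll() */
{
	pthread_mutex_lock(&drain_lock);
	while (poll_cq_once("poll") > 0)
		;			/* handle completions */
	pthread_mutex_unlock(&drain_lock);
}

static void drain_path(void)		/* cf. ipoib_drain_cq() */
{
	pthread_mutex_lock(&drain_lock);
	while (poll_cq_once("drain") > 0)
		;			/* flush before teardown */
	pthread_mutex_unlock(&drain_lock);
}

int main(void)
{
	rx_poll_path();
	drain_path();
	return 0;
}
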
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 695621f..35e16417 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -832,6 +832,7 @@ ipoib_priv_alloc(void)
priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
spin_lock_init(&priv->lock);
+ spin_lock_init(&priv->drain_lock);
mutex_init(&priv->vlan_mutex);
INIT_LIST_HEAD(&priv->path_list);
INIT_LIST_HEAD(&priv->child_intfs);
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 9c7bcec..4c04da1 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -466,12 +466,20 @@ void ipoib_mcast_join_task(struct work_struct *work)
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, mcast_task.work);
struct ifnet *dev = priv->dev;
+ struct ib_port_attr attr;
ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags);
if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
return;
+ if (ib_query_port(priv->ca, priv->port, &attr) ||
+ attr.state != IB_PORT_ACTIVE) {
+ ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d), suspending task\n",
+ __func__, attr.state);
+ return;
+ }
+
if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
ipoib_warn(priv, "ib_query_gid() failed\n");
else