From 14d3a3b2498edadec344cb11e60e66091f5daf63 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 11 Dec 2015 11:53:03 -0800
Subject: IB: add a proper completion queue abstraction

This adds an abstraction that allows ULPs to simply pass a completion
object and completion callback with each submitted WR and let the RDMA
core handle the nitty-gritty details of how to handle completion
interrupts and poll the CQ.

In detail, there is a new ib_cqe structure which just contains the
completion callback, and which can be used to get at the containing
object using container_of. It is pointed to by the WR and WC as an
alternative to the wr_id field, similar to how many ULPs already use
the field to store a pointer using casts.

A driver using the new completion callbacks allocates its CQs using the
new ib_alloc_cq API, which in addition to the number of CQEs and the
completion vectors also takes a mode for how to poll for CQEs. Three
modes are available: direct for drivers that never take CQ interrupts
and just poll for them, softirq to poll from softirq context using the
to-be-renamed blk-iopoll infrastructure (which takes care of rearming
and budgeting), or a workqueue for consumers who want to be called from
user context.

Thanks a lot to Sagi Grimberg, who helped review the API, wrote the
current version of the workqueue code because my two previous attempts
sucked too much, and converted the iSER initiator to the new API.

Signed-off-by: Christoph Hellwig
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/cq.c     | 209 +++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c |  15 ++-
 3 files changed, 223 insertions(+), 3 deletions(-)
 create mode 100644 drivers/infiniband/core/cq.c
(limited to 'drivers/infiniband/core')

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..ae48d8740 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 				$(user_access-y)

-ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=			packer.o ud_header.o verbs.o cq.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 0000000..a754fc7
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH 16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ 256
+#define IB_POLL_BUDGET_WORKQUEUE 65536
+
+#define IB_POLL_FLAGS \
+	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+	int i, n, completed = 0;
+
+	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+		for (i = 0; i < n; i++) {
+			struct ib_wc *wc = &cq->wc[i];
+
+			if (wc->wr_cqe)
+				wc->wr_cqe->done(cq, wc);
+			else
+				WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+		}
+
+		completed += n;
+
+		if (n != IB_POLL_BATCH ||
+		    (budget != -1 && completed >= budget))
+			break;
+	}
+
+	return completed;
+}
+
+/**
+ * ib_process_cq_direct - process a CQ in caller context
+ * @cq: CQ to process
+ * @budget: number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different
+ * context and does not ask for completion interrupts from the HCA.
+ *
+ * Note: for compatibility reasons -1 can be passed in %budget for unlimited
+ * polling. Do not use this feature in new code, it will be removed soon.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+
+	return __ib_process_cq(cq, budget);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	int completed;
+
+	completed = __ib_process_cq(cq, budget);
+	if (completed < budget) {
+		irq_poll_complete(&cq->iop);
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+			irq_poll_sched(&cq->iop);
+	}
+
+	return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+	irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	int completed;
+
+	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+		queue_work(ib_comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+	queue_work(ib_comp_wq, &cq->work);
+}
+
+/**
+ * ib_alloc_cq - allocate a completion queue
+ * @dev: device to allocate the CQ for
+ * @private: driver private data, accessible from cq->cq_context
+ * @nr_cqe: number of CQEs to allocate
+ * @comp_vector: HCA completion vectors for this CQ
+ * @poll_ctx: context to poll the CQ from.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */ +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return cq; + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_destroy_cq; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + ret = -EINVAL; + goto out_free_wc; + } + + return cq; + +out_free_wc: + kfree(cq->wc); +out_destroy_cq: + cq->device->destroy_cq(cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_cq); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. + */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + kfree(cq->wc); + ret = cq->device->destroy_cq(cq); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL(ib_free_cq); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 179e813..6421d23 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct ib_client_data { bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -954,10 +955,18 @@ static int __init ib_core_init(void) if (!ib_wq) return -ENOMEM; + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_wq) { + ret = -ENOMEM; + goto err; + } + ret = class_register(&ib_class); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); - goto err; + goto err_comp; } ret = ibnl_init(); @@ -972,7 +981,8 @@ static int __init ib_core_init(void) err_sysfs: class_unregister(&ib_class); - +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; @@ -983,6 +993,7 @@ static void __exit ib_core_cleanup(void) ib_cache_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } -- cgit v1.1 From 3e153a93a1c12e3354dd38cca414fb51a15136a2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 18 Dec 2015 10:59:44 +0200 Subject: IB/core: Save the device attributes on the device structure This way both the IB core and upper level drivers can access these cached device attributes rather than querying or caching them on their own. 
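As an illustration of how the two pieces above fit together, here is a
minimal consumer sketch. It is not part of either patch: my_request,
my_send_done, my_post_send and my_create_cq are hypothetical ULP names.
The ULP embeds a struct ib_cqe in its request, points wr->wr_cqe at it
instead of encoding a pointer into wr_id, and recovers the request in
the done callback with container_of(); the cached device attributes
bound the CQ size without a separate ib_query_device() call.

#include <rdma/ib_verbs.h>

struct my_request {			/* hypothetical ULP request */
	struct ib_cqe	cqe;
	void		*buf;
};

static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	/* recover the containing request; no wr_id casting needed */
	struct my_request *req =
		container_of(wc->wr_cqe, struct my_request, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_err("send failed: %s\n", ib_wc_status_msg(wc->status));
	/* ... complete and free req ... */
}

static int my_post_send(struct ib_qp *qp, struct my_request *req,
			struct ib_sge *sge)
{
	struct ib_send_wr wr = { }, *bad_wr;

	req->cqe.done = my_send_done;	/* completion callback */
	wr.wr_cqe = &req->cqe;		/* instead of wr.wr_id */
	wr.sg_list = sge;
	wr.num_sge = 1;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;

	return ib_post_send(qp, &wr, &bad_wr);
}

static struct ib_cq *my_create_cq(struct ib_device *dev, int nr_cqe)
{
	/* the cached attributes replace a per-caller ib_query_device() */
	if (nr_cqe > dev->attrs.max_cqe)
		return ERR_PTR(-EINVAL);

	return ib_alloc_cq(dev, NULL, nr_cqe, 0, IB_POLL_WORKQUEUE);
}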
Signed-off-by: Ira Weiny Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 179e813..568592e3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -325,6 +325,7 @@ int ib_register_device(struct ib_device *device, { int ret; struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); @@ -352,6 +353,13 @@ int ib_register_device(struct ib_device *device, goto out; } + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + printk(KERN_WARNING "Couldn't query the device attributes\n"); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", -- cgit v1.1 From 86bee4c9c126b4f73e3f152cd43c806cac9135ad Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 18 Dec 2015 10:59:45 +0200 Subject: IB/core: Avoid calling ib_query_device Use the cached copy of the attributes present on the device, except for the case of a query originating from user-space, where we have to invoke the driver query_device entry, so they can fill in their udata. Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 12 +----------- drivers/infiniband/core/cma.c | 8 -------- drivers/infiniband/core/fmr_pool.c | 20 ++------------------ drivers/infiniband/core/sysfs.c | 14 ++++---------- drivers/infiniband/core/uverbs_cmd.c | 23 ++++------------------- drivers/infiniband/core/verbs.c | 8 +------- 6 files changed, 12 insertions(+), 73 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0a26dd6..7fa2b94 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3731,16 +3731,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { @@ -3852,7 +3842,7 @@ static void cm_add_one(struct ib_device *ib_device) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d2d5d00..f8dfc63 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1894,7 +1894,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int ret; - struct ib_device_attr attr; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; @@ -1936,13 +1935,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - ret = ib_query_device(conn_id->id.device, &attr); - if 
(ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 9f5ad7c..6ac3683 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; @@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; - - kfree(attr); + max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index b1f37d4..1d5b4b0 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -614,18 +614,12 @@ static ssize_t show_sys_image_guid(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } static ssize_t show_node_guid(struct device *device, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 1c02dea..9561056 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct ib_device_attr dev_attr; -#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); - ret = ib_query_device(ib_dev, &dev_attr); - if (ret) - goto err_free; - if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; #endif @@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -456,12 +448,8 @@ ssize_t 
ib_uverbs_query_device(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(ib_dev, &attr); - if (ret) - return ret; - memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, ib_dev, &resp, &attr); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - struct ib_device_attr attr; - - ret = ib_query_device(pd->device, &attr); - if (ret || !(attr.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); ret = -EINVAL; goto err_put; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 545906d..7617d94 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); struct ib_pd *ib_alloc_pd(struct ib_device *device) { struct ib_pd *pd; - struct ib_device_attr devattr; - int rc; - - rc = ib_query_device(device, &devattr); - if (rc) - return ERR_PTR(rc); pd = device->alloc_pd(device, NULL, NULL); if (IS_ERR(pd)) @@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device) pd->local_mr = NULL; atomic_set(&pd->usecnt, 0); - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else { struct ib_mr *mr; -- cgit v1.1 From 182a2da0c768a9ec64abb0d6009667057f1c06af Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 18 Dec 2015 10:59:50 +0200 Subject: IB/core: Remove ib_query_device The copy of the attributes present on the device is now used by all consumers except for uverbs in case of serving user-space query, where dev->query_device is called. Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 568592e3..6def2f7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -636,25 +636,6 @@ void ib_dispatch_event(struct ib_event *event) EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. - */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; - - memset(device_attr, 0, sizeof(*device_attr)); - - return device->query_device(device, device_attr, &uhw); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query -- cgit v1.1 From f3906bd36087dd3440ecaf6e044690374d01f927 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Oct 2015 16:52:39 +0200 Subject: IB/core: Refactor GID cache's ib_dispatch_event Refactor ib_dispatch_event into a new function in order to avoid duplicating code in the next patch. 
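For context, the IB_EVENT_GID_CHANGE event that the factored-out
dispatch_gid_change_event() helper raises below is delivered to any ULP
that registered an event handler on the device. A minimal sketch, not
part of this patch (my_event_handler, my_handler and my_watch_gids are
hypothetical consumer names):

#include <rdma/ib_verbs.h>

static void my_event_handler(struct ib_event_handler *handler,
			     struct ib_event *event)
{
	if (event->event == IB_EVENT_GID_CHANGE)
		pr_info("GID table changed on %s port %d\n",
			event->device->name, event->element.port_num);
}

static struct ib_event_handler my_handler;

static void my_watch_gids(struct ib_device *device)
{
	INIT_IB_EVENT_HANDLER(&my_handler, device, my_event_handler);
	ib_register_event_handler(&my_handler);
}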
Signed-off-by: Matan Barak Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 89bebea..f0703d2 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -112,6 +112,19 @@ struct ib_gid_table { struct ib_gid_table_entry *data_vec; }; +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, @@ -164,15 +177,9 @@ static int write_gid(struct ib_device *ib_dev, u8 port, write_unlock_irqrestore(&table->data_vec[ix].lock, flags); - if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { - struct ib_event event; + if (!ret) + dispatch_gid_change_event(ib_dev, port); - event.device = ib_dev; - event.element.port_num = port; - event.event = IB_EVENT_GID_CHANGE; - - ib_dispatch_event(&event); - } return ret; } -- cgit v1.1 From 9c584f04959620e587b3b3d358076dab48a8893c Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Oct 2015 16:52:40 +0200 Subject: IB/core: Change per-entry lock in RoCE GID table to one lock Previously, IB GID cached used a lock per entry. This could result in spending a lot of CPU cycles for locking and unlocking just in order to find a GID. Changing this in favor of one lock per a GID table. Signed-off-by: Matan Barak Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 143 +++++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 54 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index f0703d2..83aee7c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -81,10 +81,6 @@ enum gid_table_write_action { }; struct ib_gid_table_entry { - /* This lock protects an entry from being - * read and written simultaneously. - */ - rwlock_t lock; unsigned long props; union ib_gid gid; struct ib_gid_attr attr; @@ -109,6 +105,10 @@ struct ib_gid_table { * are locked by this lock. **/ struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously. + */ + rwlock_t rwlock; struct ib_gid_table_entry *data_vec; }; @@ -125,6 +125,10 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) } } +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, @@ -134,16 +138,14 @@ static int write_gid(struct ib_device *ib_dev, u8 port, { int ret = 0; struct net_device *old_net_dev; - unsigned long flags; /* in rdma_cap_roce_gid_table, this funciton should be protected by a * sleep-able lock. 
*/ - write_lock_irqsave(&table->data_vec[ix].lock, flags); if (rdma_cap_roce_gid_table(ib_dev, port)) { table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + write_unlock_irq(&table->rwlock); /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by * RoCE providers and thus only updates the cache. */ @@ -153,7 +155,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, else if (action == GID_TABLE_WRITE_ACTION_DEL) ret = ib_dev->del_gid(ib_dev, port, ix, &table->data_vec[ix].context); - write_lock_irqsave(&table->data_vec[ix].lock, flags); + write_lock_irq(&table->rwlock); } old_net_dev = table->data_vec[ix].attr.ndev; @@ -175,11 +177,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port, table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); - - if (!ret) - dispatch_gid_change_event(ib_dev, port); - return ret; } @@ -208,6 +205,7 @@ static int del_gid(struct ib_device *ib_dev, u8 port, GID_TABLE_WRITE_ACTION_DEL, default_gid); } +/* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask) @@ -215,34 +213,29 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, int i; for (i = 0; i < table->sz; i++) { - unsigned long flags; struct ib_gid_attr *attr = &table->data_vec[i].attr; - read_lock_irqsave(&table->data_vec[i].lock, flags); if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != default_gid) - goto next; + continue; - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - return i; -next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); + break; } - return -1; + return i == table->sz ? 
-1 : i; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) @@ -282,6 +275,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, } mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_NETDEV); @@ -295,9 +289,12 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, goto out_unlock; } - add_gid(ib_dev, port, table, ix, gid, attr, false); + ret = add_gid(ib_dev, port, table, ix, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return ret; } @@ -312,6 +309,7 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | @@ -320,9 +318,11 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, if (ix < 0) goto out_unlock; - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return 0; } @@ -333,16 +333,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; + bool deleted = false; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + deleted = true; + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + return 0; } @@ -351,18 +359,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long flags; table = ports_table[port - rdma_start_port(ib_dev)]; if (index < 0 || index >= table->sz) return -EINVAL; - read_lock_irqsave(&table->data_vec[index].lock, flags); - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { - read_unlock_irqrestore(&table->data_vec[index].lock, flags); + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) return -EAGAIN; - } memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); if (attr) { @@ -371,7 +375,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, dev_hold(attr->ndev); } - read_unlock_irqrestore(&table->data_vec[index].lock, flags); return 0; } @@ -385,17 +388,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, struct ib_gid_table *table; u8 p; int local_index; + unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; + read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, val, false, mask); if (local_index >= 0) { if (index) *index = local_index; if (port) *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); } return -ENOENT; @@ -426,6 +433,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID; struct ib_gid_attr val = {.ndev = ndev}; + unsigned long flags; if (port < 
rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev)) @@ -436,13 +444,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; + read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask); if (local_index >= 0) { if (index) *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); return -ENOENT; } EXPORT_SYMBOL(ib_find_cached_gid_by_port); @@ -479,6 +490,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; + unsigned long flags; bool found = false; if (!ports_table) @@ -491,11 +503,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, table = ports_table[port - rdma_start_port(ib_dev)]; + read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; - unsigned long flags; - read_lock_irqsave(&table->data_vec[i].lock, flags); if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) goto next; @@ -508,11 +519,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, found = true; next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - if (found) break; } + read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; @@ -524,9 +534,9 @@ next: static struct ib_gid_table *alloc_gid_table(int sz) { - unsigned int i; struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + if (!table) return NULL; @@ -537,9 +547,7 @@ static struct ib_gid_table *alloc_gid_table(int sz) mutex_init(&table->lock); table->sz = sz; - - for (i = 0; i < sz; i++) - rwlock_init(&table->data_vec[i].lock); + rwlock_init(&table->rwlock); return table; @@ -560,17 +568,24 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { int i; + bool deleted = false; if (!table) return; + write_lock_irq(&table->rwlock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, sizeof(table->data_vec[i].gid))) - del_gid(ib_dev, port, table, i, - table->data_vec[i].props & - GID_ATTR_FIND_MASK_DEFAULT); + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, @@ -592,6 +607,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, gid_attr.ndev = ndev; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); /* Coudn't find default GID location */ @@ -604,23 +620,31 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) goto unlock; - if ((memcmp(¤t_gid, &zgid, sizeof(current_gid)) || - memcmp(¤t_gid_attr, &zattr, - sizeof(current_gid_attr))) && - del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto unlock; + if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) || + memcmp(¤t_gid_attr, &zattr, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto unlock; + } else { + dispatch_gid_change_event(ib_dev, port); 
+ } } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) { pr_warn("ib_cache_gid: unable to add default gid %pI6\n", gid.raw); + } else { + dispatch_gid_change_event(ib_dev, port); + } + } unlock: if (current_gid_attr.ndev) dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); } @@ -735,10 +759,19 @@ int ib_get_cached_gid(struct ib_device *device, union ib_gid *gid, struct ib_gid_attr *gid_attr) { + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; } EXPORT_SYMBOL(ib_get_cached_gid); @@ -963,10 +996,12 @@ static void ib_cache_update(struct ib_device *device, device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; if (!use_roce_gid_table) { + write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { modify_gid(device, port, table, i, gid_cache->table + i, &zattr, false); } + write_unlock(&table->rwlock); } device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; -- cgit v1.1 From cee3c4d0c56876f46f4584385603adb30a7cacf7 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Oct 2015 16:52:41 +0200 Subject: IB/core: don't search the GID table twice Previously, we've searched the GID table twice: first when we searched the table for a GID matching the proposed new one, and second when we didn't find a match, we searched again for an empty GID slot in the table. Instead, search the table once noting the first empty slot as we search for our target GID. Signed-off-by: Matan Barak Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 53 +++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 83aee7c..bfd0a65 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -208,19 +208,33 @@ static int del_gid(struct ib_device *ib_dev, u8 port, /* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, - unsigned long mask) + unsigned long mask, int *pempty) { - int i; + int i = 0; + int found = -1; + int empty = pempty ? 
-1 : 0; - for (i = 0; i < table->sz; i++) { - struct ib_gid_attr *attr = &table->data_vec[i].attr; + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; + i++; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + memcmp(gid, &data->gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && @@ -228,14 +242,17 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && - !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != default_gid) continue; - break; + found = curr_index; } - return i == table->sz ? -1 : i; + if (pempty) + *pempty = empty; + + return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) @@ -252,6 +269,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, int ix; int ret = 0; struct net_device *idev; + int empty; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -278,18 +296,16 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_NETDEV); + GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; - ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_DEFAULT); - if (ix < 0) { + if (empty < 0) { ret = -ENOSPC; goto out_unlock; } - ret = add_gid(ib_dev, port, table, ix, gid, attr, false); + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); if (!ret) dispatch_gid_change_event(ib_dev, port); @@ -314,7 +330,8 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_NETDEV | - GID_ATTR_FIND_MASK_DEFAULT); + GID_ATTR_FIND_MASK_DEFAULT, + NULL); if (ix < 0) goto out_unlock; @@ -393,7 +410,7 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; read_lock_irqsave(&table->rwlock, flags); - local_index = find_gid(table, gid, val, false, mask); + local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; @@ -445,7 +462,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); - local_index = find_gid(table, gid, &val, false, mask); + local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; @@ -608,7 +625,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); write_lock_irq(&table->rwlock); - ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); + ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT, NULL); /* Coudn't find default GID location */ WARN_ON(ix < 0); -- cgit v1.1 From b39ffa1df505378336a85064ad9ec403765bbb0b Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:47 +0200 Subject: IB/core: Add gid_type to gid attribute In order to support 
multiple GID types, we need to store the gid_type with each GID. This is also aligned with the RoCE v2 annex "RoCEv2 PORT GID table entries shall have a "GID type" attribute that denotes the L3 Address type". The currently supported GID is IB_GID_TYPE_IB which is also RoCE v1 GID type. This implies that gid_type should be added to roce_gid_table meta-data. Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 144 ++++++++++++++++++++---------- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/cma.c | 3 +- drivers/infiniband/core/core_priv.h | 4 + drivers/infiniband/core/device.c | 9 +- drivers/infiniband/core/multicast.c | 2 +- drivers/infiniband/core/roce_gid_mgmt.c | 60 +++++++++++-- drivers/infiniband/core/sa_query.c | 5 +- drivers/infiniband/core/uverbs_marshall.c | 1 + drivers/infiniband/core/verbs.c | 1 + 10 files changed, 170 insertions(+), 61 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index bfd0a65..06e47e1 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -64,6 +64,7 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_props { @@ -125,6 +126,19 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) } } +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + /* This function expects that rwlock will be write locked in all * scenarios and that lock will be locked in sleep-able (RoCE) * scenarios. 
@@ -233,6 +247,10 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, if (found >= 0) continue; + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->gid, sizeof(*gid))) continue; @@ -296,6 +314,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; @@ -329,6 +348,7 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV | GID_ATTR_FIND_MASK_DEFAULT, NULL); @@ -427,11 +447,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, static int ib_cache_gid_find(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port, u16 *index) { - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -442,14 +464,16 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, struct net_device *ndev, u16 *index) { int local_index; struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr val = {.ndev = ndev}; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; unsigned long flags; if (port < rdma_start_port(ib_dev) || @@ -607,15 +631,15 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union ib_gid gid; struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; - int ix; - union ib_gid current_gid; - struct ib_gid_attr current_gid_attr = {}; + unsigned int gid_type; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -623,55 +647,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; - mutex_lock(&table->lock); - write_lock_irq(&table->rwlock); - ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT, NULL); - - /* Coudn't find default GID location */ - WARN_ON(ix < 0); - - if (!__ib_cache_gid_get(ib_dev, port, ix, - ¤t_gid, ¤t_gid_attr) && - mode == IB_CACHE_GID_DEFAULT_MODE_SET && - !memcmp(&gid, ¤t_gid, sizeof(gid)) && - !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) - goto unlock; - - if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) || - memcmp(¤t_gid_attr, &zattr, - sizeof(current_gid_attr))) { - if (del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto unlock; - } else { - dispatch_gid_change_event(ib_dev, port); + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union 
ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Coudn't find default GID location */ + WARN_ON(ix < 0); + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + ¤t_gid, ¤t_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, ¤t_gid, sizeof(gid)) && + !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) || + memcmp(¤t_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } } - } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) { - pr_warn("ib_cache_gid: unable to add default gid %pI6\n", - gid.raw); - } else { - dispatch_gid_change_event(ib_dev, port); + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); } - } -unlock: - if (current_gid_attr.ndev) - dev_put(current_gid_attr.ndev); - write_unlock_irq(&table->rwlock); - mutex_unlock(&table->lock); +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } } static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { - if (rdma_protocol_roce(ib_dev, port)) { - struct ib_gid_table_entry *entry = &table->data_vec[0]; + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; } return 0; @@ -794,11 +845,12 @@ EXPORT_SYMBOL(ib_get_cached_gid); int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { - return ib_cache_gid_find(device, gid, ndev, port_num, index); + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 7fa2b94..d93b82f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - ndev, &p, NULL)) { + IB_GID_TYPE_IB, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f8dfc63..2637ebf 100644 --- a/drivers/infiniband/core/cma.c +++ 
b/drivers/infiniband/core/cma.c @@ -456,7 +456,8 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if (dev_type == ARPHRD_ETHER) ndev = dev_get_by_index(&init_net, bound_if_index); - ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL); + ret = ib_find_cached_gid_by_port(device, gid, IB_GID_TYPE_IB, port, + ndev, NULL); if (ndev) dev_put(ndev); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 5cf6eb7..d531f91 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -70,8 +70,11 @@ enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_DELETE }; +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, @@ -87,6 +90,7 @@ int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index cce7a65..00da80e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -815,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. 
*/ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index) + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, port, + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, index)) { *port_num = port; return 0; } } + if (gid_type != IB_GID_TYPE_IB) + continue; + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index bb6685f..6911ae6 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -729,7 +729,7 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, + ret = ib_find_cached_gid(device, &rec->port_gid, IB_GID_TYPE_IB, NULL, &p, &gid_index); if (ret) return ret; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 178f984..61c27a7 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -67,17 +67,52 @@ struct netdev_event_work { struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce, IB_GID_TYPE_ROCE}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { - switch (gid_op) { - case GID_ADD: - ib_cache_gid_add(ib_dev, port, gid, gid_attr); - break; - case GID_DEL: - ib_cache_gid_del(ib_dev, port, gid, gid_attr); - break; + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } } } @@ -203,6 +238,8 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, u8 port, struct net_device *event_ndev, struct net_device *rdma_ndev) { + unsigned long gid_type_mask; + rcu_read_lock(); if (!rdma_ndev || ((rdma_ndev != event_ndev && @@ -215,7 +252,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, } rcu_read_unlock(); - ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } @@ -237,9 +276,14 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, if 
(is_upper_dev_rcu(rdma_ndev, event_ndev) &&
	    is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
	    BONDING_SLAVE_STATE_INACTIVE) {
+		unsigned long gid_type_mask;
+
 		rcu_read_unlock();
 
+		gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
 		ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+					     gid_type_mask,
 					     IB_CACHE_GID_DEFAULT_MODE_DELETE);
 	} else {
 		rcu_read_unlock();
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index a95a32b..270faaa 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -1014,8 +1014,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 		ah_attr->ah_flags = IB_AH_GRH;
 		ah_attr->grh.dgid = rec->dgid;
 
-		ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
-					 &gid_index);
+		ret = ib_find_cached_gid(device, &rec->sgid, rec->gid_type, ndev,
+					 &port_num, &gid_index);
 		if (ret) {
 			if (ndev)
 				dev_put(ndev);
@@ -1157,6 +1157,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 			  mad->data, &rec);
 		rec.net = NULL;
 		rec.ifindex = 0;
+		rec.gid_type = IB_GID_TYPE_IB;
 		memset(rec.dmac, 0, ETH_ALEN);
 		query->callback(status, &rec, query->context);
 	} else
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 7d2f14c..af020f8 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
 	memset(dst->dmac, 0, sizeof(dst->dmac));
 	dst->net = NULL;
 	dst->ifindex = 0;
+	dst->gid_type = IB_GID_TYPE_IB;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 7617d94..e397d8b 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -381,6 +381,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 
 	if (!rdma_cap_eth_ah(device, port_num)) {
 		ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+						 IB_GID_TYPE_IB,
 						 port_num, NULL,
 						 &gid_index);
 		if (ret)
-- cgit v1.1

From cb57bb849effcaa83addd739595d3dea3a5905fb Mon Sep 17 00:00:00 2001
From: Matan Barak
Date: Wed, 23 Dec 2015 14:56:48 +0200
Subject: IB/cm: Use the source GID index type

Previously, the cm and cma modules supported only the IB and RoCE v1 GID
type. In order to support multiple GID types, the gid_type is passed to
cm_init_av_by_path and stored in the path record.

The rdma cm client would use a default GID type that will be saved in
rdma_id_private.
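A sketch of the consumer side (not part of the patch; the helper name
my_resolve_sgid_index is hypothetical, and device/path/ndev are assumed
to come from the caller): the type stored in the path record now
travels into the cache lookup alongside the GID value itself.

static int my_resolve_sgid_index(struct ib_device *device,
				 struct ib_sa_path_rec *path,
				 struct net_device *ndev,
				 u8 *port_num, u16 *gid_index)
{
	/* Two table entries may hold the same 128-bit GID value but
	 * differ in type; path->gid_type disambiguates the match.
	 */
	return ib_find_cached_gid(device, &path->sgid, path->gid_type,
				  ndev, port_num, gid_index);
}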
Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 25 ++++++++++++++++++++----- drivers/infiniband/core/cma.c | 2 ++ 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d93b82f..d883a32 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - IB_GID_TYPE_IB, ndev, &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work) struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1639,11 +1641,24 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, &work->path[0].sgid, - NULL); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) + dev_put(gid_attr.ndev); + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2637ebf..446323a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -228,6 +228,7 @@ struct rdma_id_private { u8 tos; u8 reuseaddr; u8 afonly; + enum ib_gid_type gid_type; }; struct cma_multicast { @@ -2314,6 +2315,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); route->path_rec->net = &init_net; route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->gid_type = id_priv->gid_type; } if (!ndev) { ret = -ENODEV; -- cgit v1.1 From 470be516a226e851d62a8d3d31dc162500b84487 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:49 +0200 Subject: IB/core: Add gid attributes to sysfs This patch set adds attributes of net device and gid type to each GID in the GID table. Users that use verbs directly need to specify the GID index. Since the same GID could have different types or associated net devices, users should have the ability to query the associated GID attributes. Adding these attributes to sysfs. 
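The resulting attributes sit next to the existing gids table under each
port directory and can be read like any other sysfs file:

/sys/class/infiniband/<device>/ports/<port>/gid_attrs/ndevs/<index>
/sys/class/infiniband/<device>/ports/<port>/gid_attrs/types/<index>

A userspace sketch, not part of the patch (the device name mlx4_0,
port 1 and GID index 0 are example values):

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/class/infiniband/mlx4_0/ports/1/gid_attrs/types/0", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("GID 0 type: %s", buf);	/* e.g. "IB/RoCE v1" */
	fclose(f);
	return 0;
}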
Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 184 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 1d5b4b0..91847d38 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -37,12 +37,22 @@ #include #include #include +#include #include +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; struct ib_port { struct kobject kobj; struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; @@ -84,6 +94,24 @@ static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -281,6 +309,46 @@ static struct attribute *port_default_attrs[] = { NULL }; +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->name); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + va_list args; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + va_end(args); + return ret; +} + static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -296,6 +364,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "%pI6\n", gid.raw); } +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -451,12 +532,41 @@ static void ib_port_release(struct kobject *kobj) kfree(p); } +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + 
kfree(g->type.attrs); + } + + kfree(g); +} + static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_attrs = port_default_attrs }; +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf), @@ -528,9 +638,23 @@ static int add_port(struct ib_device *device, int port_num, return ret; } + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; + goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + ret = sysfs_create_group(&p->kobj, &pma_group); if (ret) - goto err_put; + goto err_put_gid_attrs; p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -543,12 +667,38 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_gid; + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); if (!p->pkey_group.attrs) { ret = -ENOMEM; - goto err_remove_gid; + goto err_remove_gid_type; } ret = sysfs_create_group(&p->kobj, &p->pkey_group); @@ -576,6 +726,28 @@ err_free_pkey: kfree(p->pkey_group.attrs); p->pkey_group.attrs = NULL; +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -589,6 +761,9 @@ err_free_gid: err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); + err_put: kobject_put(&p->kobj); return ret; @@ -797,6 +972,11 @@ static void free_port_list_attributes(struct ib_device *device) sysfs_remove_group(p, &pma_group); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); kobject_put(p); } -- cgit v1.1 From 7766a99fdcd30c78fc8299db9102e3624232007c Mon Sep 17 00:00:00 2001 
From: Matan Barak Date: Wed, 23 Dec 2015 14:56:50 +0200 Subject: IB/core: Add ROCE_UDP_ENCAP (RoCE V2) type Adding RoCE v2 GID type and port type. Vendors which support this type will get their GID table populated with RoCE v2 GIDs automatically. Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 1 + drivers/infiniband/core/roce_gid_mgmt.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 06e47e1..4a2968b 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -128,6 +128,7 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) static const char * const gid_type_str[] = { [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 61c27a7..1e3673f 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -71,7 +71,8 @@ static const struct { bool (*is_supported)(const struct ib_device *device, u8 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { - {rdma_protocol_roce, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) -- cgit v1.1 From c865f24628b9310e1815d59f723a34ea3df4890f Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Wed, 23 Dec 2015 14:56:51 +0200 Subject: IB/core: Add rdma_network_type to wc Providers should tell IB core the wc's network type. This is used in order to search for the proper GID in the GID table. When using HCAs that can't provide this info, IB core falls back to examining the packet in depth and extracting the GID type itself. In RDMA-CM, we choose the sgid_index and type from all matching entries based on a hint from the IP stack, and we set the hop_limit for the IP packet based on the same hint. Signed-off-by: Matan Barak Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 14 +++++ drivers/infiniband/core/cma.c | 11 +++- drivers/infiniband/core/verbs.c | 123 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 142 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 34b1ada..6e35299 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -256,6 +256,12 @@ static int addr4_resolve(struct sockaddr_in *src_in, goto put; } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly.
+ */ + if (rt->rt_uses_gateway) + addr->network = RDMA_NETWORK_IPV4; + ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); put: ip_rt_put(rt); @@ -270,6 +276,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, { struct flowi6 fl6; struct dst_entry *dst; + struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -281,6 +288,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if ((ret = dst->error)) goto put; + rt = (struct rt6_info *)dst; if (ipv6_addr_any(&fl6.saddr)) { ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr); @@ -304,6 +312,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, goto put; } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. + */ + if (rt->rt6i_flags & RTF_GATEWAY) + addr->network = RDMA_NETWORK_IPV6; + ret = dst_fetch_ha(dst, addr, &fl6.daddr); put: dst_release(dst); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 446323a..0a29d60 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2291,6 +2291,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; + enum ib_gid_type network_gid_type; struct cma_work *work; int ret; struct net_device *ndev = NULL; @@ -2329,7 +2330,15 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + network_gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (addr->dev_addr.network != RDMA_NETWORK_IB) { + route->path_rec->gid_type = network_gid_type; + /* TODO: get the hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT; + } else { + route->path_rec->hop_limit = 1; + } route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e397d8b..b25e5fb 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -305,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) } EXPORT_SYMBOL(ib_create_ah); +static int ib_get_header_version(const union rdma_network_hdr *hdr) +{ + const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; + struct iphdr ip4h_checked; + const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if (ip6h->version != 6) + return (ip4h->version == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ihl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. 
+ */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_header_version((union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + struct find_gid_index_context { u16 vlan_id; + enum ib_gid_type gid_type; }; static bool find_gid_index(const union ib_gid *gid, @@ -316,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid, struct find_gid_index_context *ctx = (struct find_gid_index_context *)context; + if (ctx->gid_type != gid_attr->gid_type) + return false; + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || (is_vlan_dev(gid_attr->ndev) && vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) @@ -326,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid, static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, u16 *gid_index) { - struct find_gid_index_context context = {.vlan_id = vlan_id}; + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context, gid_index); } +static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.saddr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.daddr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} + int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct ib_ah_attr *ah_attr) @@ -341,9 +432,25 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, u32 flow_class; u16 gid_index; int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + union ib_gid dgid; + union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + if (rdma_protocol_roce(device, port_num)) { u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? 
wc->vlan_id : 0xffff; @@ -352,7 +459,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (!(wc->wc_flags & IB_WC_WITH_SMAC) || !(wc->wc_flags & IB_WC_WITH_VLAN)) { - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, + ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid, ah_attr->dmac, wc->wc_flags & IB_WC_WITH_VLAN ? NULL : &vlan_id, @@ -362,7 +469,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &grh->dgid, &gid_index); + &dgid, gid_type, &gid_index); if (ret) return ret; @@ -377,10 +484,10 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (wc->wc_flags & IB_WC_GRH) { ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; + ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &grh->dgid, + ret = ib_find_cached_gid_by_port(device, &dgid, IB_GID_TYPE_IB, port_num, NULL, &gid_index); @@ -1020,6 +1127,12 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ret = -ENXIO; goto out; } + if (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + /* TODO: get the hoplimit from the inet/inet6 + * device + */ + qp_attr->ah_attr.grh.hop_limit = + IPV6_DEFAULT_HOPLIMIT; ifindex = sgid_attr.ndev->ifindex; -- cgit v1.1 From 6020d7e5004cc8591d61d449e9285a6f08279541 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:52 +0200 Subject: IB/core: Move rdma_is_upper_dev_rcu to header file In order to validate the route, we need an easy way to check if a net-device belongs to our RDMA device. Move this helper function to a header file in order to make this check easier. Signed-off-by: Matan Barak Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 13 +++++++++++++ drivers/infiniband/core/roce_gid_mgmt.c | 20 ++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d531f91..3b250a2 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -96,4 +96,17 @@ int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + struct net_device *_upper = NULL; + struct list_head *iter; + + netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) + if (_upper == upper) + break; + + return _upper == upper; +} + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 1e3673f..06556c3 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -139,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de return BONDING_SLAVE_STATE_NA; } -static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) -{ - struct net_device *_upper = NULL; - struct list_head *iter; - - netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) - if (_upper == upper) - break; - - return _upper == upper; -} - #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, @@ -168,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, if (!real_dev) real_dev = event_ndev; - res = 
((is_upper_dev_rcu(rdma_ndev, event_ndev) && + res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); @@ -214,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port, return 1; rcu_read_lock(); - res = is_upper_dev_rcu(rdma_ndev, event_ndev); + res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev); rcu_read_unlock(); return res; @@ -244,7 +232,7 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, rcu_read_lock(); if (!rdma_ndev || ((rdma_ndev != event_ndev && - !is_upper_dev_rcu(rdma_ndev, event_ndev)) || + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, netdev_master_upper_dev_get_rcu(rdma_ndev)) == BONDING_SLAVE_STATE_INACTIVE)) { @@ -274,7 +262,7 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, rcu_read_lock(); - if (is_upper_dev_rcu(rdma_ndev, event_ndev) && + if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE) { unsigned long gid_type_mask; -- cgit v1.1 From 200298326b276d8dbeff204f7d407432100d9963 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:53 +0200 Subject: IB/core: Validate route when we init ah In order to make sure API users don't try to use SGIDs which don't conform to the routing table, validate the route before searching the RoCE GID table. Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 175 ++++++++++++++++++++++++++----------- drivers/infiniband/core/cm.c | 10 ++- drivers/infiniband/core/cma.c | 30 ++++++- drivers/infiniband/core/sa_query.c | 75 ++++++++++++++-- drivers/infiniband/core/verbs.c | 48 +++++++--- 5 files changed, 262 insertions(+), 76 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 6e35299..0b5f245 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; @@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(dev_addr->net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); if (!dev) return ret; @@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, rcu_read_lock(); for_each_netdev_rcu(dev_addr->net, dev) { if (ipv6_chk_addr(dev_addr->net, - &((struct sockaddr_in6 *) addr)->sin6_addr, + &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) @@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr) +static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr) { struct neighbour *n; int ret; @@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v } static int addr4_resolve(struct sockaddr_in *src_in, - 
struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct rtable **prt) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -243,36 +246,23 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } - - /* If the device does ARP internally, return 'done' */ - if (rt->dst.dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, rt->dst.dev, NULL); - goto put; - } - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't * routable) and we could set the network type accordingly. */ if (rt->rt_uses_gateway) addr->network = RDMA_NETWORK_IPV4; - ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); -put: - ip_rt_put(rt); + *prt = rt; + return 0; out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { struct flowi6 fl6; struct dst_entry *dst; @@ -299,49 +289,109 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } - - /* If the device does ARP internally, return 'done' */ - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - goto put; - } - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't * routable) and we could set the network type accordingly. */ if (rt->rt6i_flags & RTF_GATEWAY) addr->network = RDMA_NETWORK_IPV6; - ret = dst_fetch_ha(dst, addr, &fl6.daddr); + *pdst = dst; + return 0; put: dst_release(dst); return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { return -EADDRNOTAVAIL; } #endif +static int addr_resolve_neigh(struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dst->dev->flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dst->dev->flags & IFF_NOARP)) { + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + return dst_fetch_ha(dst, addr, + dst_in->sa_family == AF_INET ? 
+ (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr); + } + + return rdma_copy_addr(addr, dst->dev, NULL); +} + static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) { + struct net_device *ndev; + struct dst_entry *dst; + int ret; + if (src_in->sa_family == AF_INET) { - return addr4_resolve((struct sockaddr_in *) src_in, - (struct sockaddr_in *) dst_in, addr); - } else - return addr6_resolve((struct sockaddr_in6 *) src_in, - (struct sockaddr_in6 *) dst_in, addr); + struct rtable *rt = NULL; + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + + ret = addr4_resolve((struct sockaddr_in *)src_in, + dst_in4, addr, &rt); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + + ndev = rt->dst.dev; + dev_hold(ndev); + + ip_rt_put(rt); + } else { + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + dst_in6, addr, + &dst); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr); + + ndev = dst->dev; + dev_hold(ndev); + + dst_release(dst); + } + + addr->bound_dev_if = ndev->ifindex; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; } static void process_req(struct work_struct *work) @@ -357,7 +407,8 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr, + true); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -417,7 +468,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr, true); switch (req->status) { case 0: req->timeout = jiffies; @@ -439,6 +490,25 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + if (src_addr) + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + else + src_in->sa_family = dst_addr->sa_family; + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -471,7 +541,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, } int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int if_index) + u8 *dmac, u16 *vlan_id, int *if_index) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -489,7 +559,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = if_index; + if (if_index) + dev_addr.bound_dev_if = *if_index; dev_addr.net = &init_net; ctx.addr = &dev_addr; @@ -505,6 +576,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi 
dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d883a32..e3a95d1 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1646,8 +1646,11 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->av.ah_attr.grh.sgid_index, &gid, &gid_attr); if (!ret) { - if (gid_attr.ndev) + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); dev_put(gid_attr.ndev); + } work->path[0].gid_type = gid_attr.gid_type; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); } @@ -1656,8 +1659,11 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, 0, &work->path[0].sgid, &gid_attr); - if (!err && gid_attr.ndev) + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); dev_put(gid_attr.ndev); + } work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0a29d60..fce11df 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -454,8 +454,20 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER) + if (dev_type == ARPHRD_ETHER) { ndev = dev_get_by_index(&init_net, bound_if_index); + if (ndev && ndev->flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); + + if (!device->get_netdev) + return -EOPNOTSUPP; + + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } ret = ib_find_cached_gid_by_port(device, gid, IB_GID_TYPE_IB, port, ndev, NULL); @@ -2314,8 +2326,22 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) + return -ENODEV; + + if (ndev->flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) + return -EOPNOTSUPP; + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) + return -ENODEV; + } + route->path_rec->net = &init_net; - route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->ifindex = ndev->ifindex; route->path_rec->gid_type = id_priv->gid_type; } if (!ndev) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 270faaa..e364a42 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -49,7 +49,9 @@ #include #include #include +#include #include "sa.h" +#include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); @@ -996,7 +998,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -1006,16 +1009,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_cap_eth_ah(device, port_num); + use_roce = 
rdma_cap_eth_ah(device, port_num); + + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; - if (rec->hop_limit > 1 || force_grh) { - struct net_device *ndev = ib_get_ndev_from_path(rec); + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + if (rec->hop_limit > 1 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, rec->gid_type, ndev, - &port_num, &gid_index); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); if (ret) { if (ndev) dev_put(ndev); @@ -1029,9 +1087,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if (ndev) dev_put(ndev); } - if (force_grh) { + + if (use_roce) memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); - } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b25e5fb..063210b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -451,30 +451,52 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, return ret; if (rdma_protocol_roce(device, port_num)) { + int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; + struct net_device *idev; + struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!(wc->wc_flags & IB_WC_WITH_SMAC) || - !(wc->wc_flags & IB_WC_WITH_VLAN)) { - ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid, - ah_attr->dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - 0); - if (ret) - return ret; + if (!device->get_netdev) + return -EOPNOTSUPP; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? 
+ NULL : &vlan_id, + &if_index); + if (ret) { + dev_put(idev); + return ret; } + resolved_dev = dev_get_by_index(&init_net, if_index); + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + rcu_read_lock(); + if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, + resolved_dev)) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) + return ret; + ret = get_sgid_index_from_eth(device, port_num, vlan_id, &dgid, gid_type, &gid_index); if (ret) return ret; - - if (wc->wc_flags & IB_WC_WITH_SMAC) - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); } ah_attr->dlid = wc->slid; @@ -1139,7 +1161,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, qp_attr->ah_attr.dmac, - NULL, ifindex); + NULL, &ifindex); dev_put(sgid_attr.ndev); } -- cgit v1.1 From 218a773f7632d8553638c76d3a5a8c77e82ccea1 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:54 +0200 Subject: IB/rdma_cm: Add wrapper for cma reference count Currently, cma users can't increase or decrease the cma reference count. This is necessary when setting cma attributes (like the default GID type) in order to avoid use-after-free errors. Adding cma_ref_dev and cma_deref_dev APIs. Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 11 +++++++++-- drivers/infiniband/core/core_priv.h | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fce11df..322f1c6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -60,6 +60,8 @@ #include #include +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); @@ -185,6 +187,11 @@ enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks.
@@ -339,7 +346,7 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = @@ -347,7 +354,7 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 3b250a2..1945b4e 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,10 @@ #include +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); -- cgit v1.1 From 045959db65c67d7189dc89ecddb5fa10aafa449d Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 23 Dec 2015 14:56:55 +0200 Subject: IB/cma: Add configfs for rdma_cm Users would like to control the behaviour of rdma_cm. For example, old applications which don't set the required RoCE gid type could be executed on RoCE V2 network types. In order to support this configuration, we implement a configfs for rdma_cm. In order to use the configfs, one needs to mount it and mkdir inside rdma_cm directory. The patch adds support for a single configuration file, default_roce_mode. The mode can either be "IB/RoCE v1" or "RoCE v2". Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 2 + drivers/infiniband/core/cache.c | 24 +++ drivers/infiniband/core/cma.c | 108 ++++++++++- drivers/infiniband/core/cma_configfs.c | 322 +++++++++++++++++++++++++++++++++ drivers/infiniband/core/core_priv.h | 24 +++ 5 files changed, 473 insertions(+), 7 deletions(-) create mode 100644 drivers/infiniband/core/cma_configfs.c (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index ae48d8740..f818538 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -24,6 +24,8 @@ iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + rdma_ucm-y := ucma.o ib_addr-y := addr.o diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 4a2968b..92cadbd 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -140,6 +140,30 @@ const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) } EXPORT_SYMBOL(ib_cache_gid_type_str); +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + /* This function expects that rwlock will be write locked in all * scenarios and that lock will be locked in sleep-able (RoCE) * scenarios. 
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 322f1c6..75987b0 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -152,6 +152,7 @@ struct cma_device { struct completion comp; atomic_t refcount; struct list_head id_list; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { @@ -192,6 +193,62 @@ void cma_ref_dev(struct cma_device *cma_dev) atomic_inc(&cma_dev->refcount); } +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. 
@@ -343,17 +400,27 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) { cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) @@ -449,6 +516,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, union ib_gid *gid, int dev_type, int bound_if_index) { @@ -474,9 +542,11 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if (!ndev) return -ENODEV; } + } else { + gid_type = IB_GID_TYPE_IB; } - ret = ib_find_cached_gid_by_port(device, gid, IB_GID_TYPE_IB, port, + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); if (ndev) @@ -511,7 +581,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { @@ -530,8 +603,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, - dev_addr->dev_type, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? 
+ IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { id_priv->id.port_num = port; @@ -2062,7 +2138,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; @@ -3896,12 +3972,27 @@ static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3981,6 +4072,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data) mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } @@ -4114,6 +4206,7 @@ static int __init cma_init(void) if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + cma_configfs_init(); return 0; @@ -4128,6 +4221,7 @@ err_wq: static void __exit cma_cleanup(void) { + cma_configfs_exit(); ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 0000000..bd1d640 --- /dev/null +++ b/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "core_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + unsigned int port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct config_group *default_dev_group[2]; + struct config_group **default_ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(ib_dev->name, cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_deref_dev(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type = ib_cache_gid_parse_type_str(buf); + ssize_t ret; + + if (gid_type < 0) + return -EINVAL; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? 
strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + NULL, +}; + +static struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct ib_device *ibdev; + unsigned int i; + unsigned int ports_num; + struct cma_dev_port_group *ports; + int err; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + + cma_dev_group->default_ports_group = kcalloc(ports_num + 1, + sizeof(*cma_dev_group->ports), + GFP_KERNEL); + + if (!ports || !cma_dev_group->default_ports_group) { + err = -ENOMEM; + goto free; + } + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + cma_dev_group->default_ports_group[i] = &ports[i].group; + } + cma_dev_group->default_ports_group[i] = NULL; + cma_dev_group->ports = ports; + + return 0; +free: + kfree(ports); + kfree(cma_dev_group->default_ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; + return err; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + kfree(cma_dev_group->default_ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group *cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + cma_dev_group->ports_group.default_groups = + cma_dev_group->default_ports_group; + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + cma_dev_group->device_group.default_groups + = cma_dev_group->default_dev_group; + cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group; + 
cma_dev_group->default_dev_group[1] = NULL; + + config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + + cma_deref_dev(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_deref_dev(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, +}; + +static struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + return configfs_register_subsystem(&cma_subsys); +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); +} diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 1945b4e..eab3221 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,9 +38,31 @@ #include +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif struct cma_device; void cma_ref_dev(struct cma_device *cma_dev); void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, @@ -74,6 +96,8 @@ enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_DELETE }; +int ib_cache_gid_parse_type_str(const char *buf); + const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, -- cgit v1.1 From 25f40220e56b507fce186985c0ed2967b7f04596 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Dec 2015 14:56:56 +0200 Subject: IB/core: Initialize UD header structure with IP and UDP headers ib_ud_header_init() is used to format InfiniBand headers in a buffer up to (but not including) the BTH. For RoCE UDP ENCAP, this function must also be able to build IP and UDP headers.
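A minimal usage sketch (the wrapper name and argument values are illustrative; the argument order follows the kernel-doc in the diff below): a RoCE v2 consumer asks for Ethernet, IPv4 and UDP headers instead of a GRH, then computes the IPv4 checksum with the new ib_ud_ip4_csum() helper once the address fields are filled in:

	static int build_rocev2_ipv4_header(int payload_bytes,
					    struct ib_ud_header *header)
	{
		int ret;

		ret = ib_ud_header_init(payload_bytes,
					0,	/* lrh_present: no LRH on Ethernet */
					1,	/* eth_present */
					0,	/* vlan_present */
					0,	/* grh_present: IP header replaces GRH */
					4,	/* ip_version: IPv4 */
					1,	/* udp_present: RoCE v2 UDP encap */
					0,	/* immediate_present */
					header);
		if (ret)
			return ret;

		/* The caller fills header->ip4 (saddr, daddr, tot_len, ttl, ...)
		 * and only then computes the header checksum:
		 */
		header->ip4.check = ib_ud_ip4_csum(header);
		return 0;
	}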
Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/ud_header.c | 155 +++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 72feee6..96697e7 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = { .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = { .size_bits = 24 } }; +__be16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int 
lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { u16 packet_length; @@ -252,7 +350,7 @@ void ib_ud_header_init(int payload_bytes, if (vlan_present) header->eth.type = cpu_to_be16(ETH_P_8021Q); - if (grh_present) { + if (ip_version == 6 || grh_present) { header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -260,8 +358,30 @@ void ib_ud_header_init(int payload_bytes, payload_bytes + 4 + /* ICRC */ 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + int udp_bytes = udp_present ? IB_UDP_BYTES : 0; + + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +393,11 @@ void ib_ud_header_init(int payload_bytes, header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->grh, buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, buf + len); -- cgit v1.1 From bee3c3c91865d520cb692689500df051e4ca3dd6 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Dec 2015 14:56:57 +0200 Subject: IB/cma: Join and leave multicast groups with IGMP Since RoCEv2 is a protocol that runs over an IP header, it is required to send IGMP join and leave requests to the network when joining and leaving multicast groups.
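For reference (an illustrative sketch of the address mapping this relies on, not a hunk from this patch; ndev is assumed to be resolved as in the cma_igmp_send() helper below): for an IPv4 group, the multicast IP address sits in the last four bytes of the MGID, so it can be handed straight to the kernel IGMP entry points:

	union ib_gid *mgid = &mc->multicast.ib->rec.mgid;
	struct in_device *in_dev;
	/* bytes 12..15 of the raw MGID carry the IPv4 group address */
	__be32 group = *(__be32 *)(mgid->raw + 12);

	rtnl_lock();
	in_dev = __in_dev_get_rtnl(ndev);
	if (in_dev)
		ip_mc_inc_group(in_dev, group); /* join; ip_mc_dec_group() leaves */
	rtnl_unlock();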
Signed-off-by: Moni Shoua Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 96 ++++++++++++++++++++++++++++++++++--- drivers/infiniband/core/multicast.c | 17 ++++++- 2 files changed, 104 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 75987b0..559ee3d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -304,6 +305,7 @@ struct cma_multicast { void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; }; struct cma_work { @@ -400,6 +402,26 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) +{ + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 0 : -ENODEV; +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -1532,8 +1554,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); + } } } @@ -3645,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) event.status = status; event.param.ud.private_data = mc->context; if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(&init_net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + if (ndev) + dev_put(ndev); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; @@ -3783,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3815,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = cma_igmp_send(ndev, 
&mc->multicast.ib->rec.mgid, + true); + if (!err) { + mc->igmp_joined = true; + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -3856,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); @@ -3901,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) if (rdma_cap_ib_mcast(id->device, id->port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } kref_put(&mc->mcref, release_mc); - + } return; } } diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 6911ae6..250937c 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, IB_GID_TYPE_IB, - NULL, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; -- cgit v1.1 From a4d825a01e51b9c74d5a64237dd8b901822cf035 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Dec 2015 19:12:46 +0100 Subject: IB: remove ib_query_mr This functionality has no users and was only supported by the staged out EHCA driver. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Jason Gunthorpe [core] Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 063210b..70b1016 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1352,13 +1352,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) } EXPORT_SYMBOL(ib_get_dma_mr); -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - return mr->device->query_mr ? 
- mr->device->query_mr(mr, mr_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_mr); - int ib_dereg_mr(struct ib_mr *mr) { struct ib_pd *pd; -- cgit v1.1 From feb7c1e38bccfd18cc06677cb648ed2340788fe8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Dec 2015 19:12:48 +0100 Subject: IB: remove in-kernel support for memory windows Remove the unused ib_alloc_mw and ib_bind_mw functions, remove the unused IB_WR_BIND_MW and IB_WC_BIND_MW opcodes and move ib_dealloc_mw into the uverbs module. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Jason Gunthorpe [core] Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 2 ++ drivers/infiniband/core/uverbs_cmd.c | 4 ++-- drivers/infiniband/core/uverbs_main.c | 13 ++++++++++++- drivers/infiniband/core/verbs.c | 36 ----------------------------------- 4 files changed, 16 insertions(+), 39 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 94bbd8c..612ccfd 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9561056..5428ebe 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1243,7 +1243,7 @@ err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); @@ -1272,7 +1272,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index e3ef288..39680ae 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = @@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); kfree(uobj); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 70b1016..c5e0f07 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1403,42 +1403,6 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, } EXPORT_SYMBOL(ib_alloc_mr); -/* Memory windows */ - -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) -{ - struct ib_mw *mw; - - if (!pd->device->alloc_mw) - return ERR_PTR(-ENOSYS); - - mw = pd->device->alloc_mw(pd, type); - if (!IS_ERR(mw)) { - mw->device = pd->device; - mw->pd = pd; - mw->uobject = NULL; - mw->type = type; - atomic_inc(&pd->usecnt); - } - - return mw; -}
-EXPORT_SYMBOL(ib_alloc_mw); - -int ib_dealloc_mw(struct ib_mw *mw) -{ - struct ib_pd *pd; - int ret; - - pd = mw->pd; - ret = mw->device->dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_mw); - /* "Fast" memory regions */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, -- cgit v1.1 From ab67ed8de0250e9ad7956ff4d98c3c98858b6c3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Dec 2015 19:12:54 +0100 Subject: IB: remove the write-only usecnt field from struct ib_mr Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 6 ------ drivers/infiniband/core/verbs.c | 8 +------- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5428ebe..0a84182d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -993,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); @@ -1091,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } } - if (atomic_read(&mr->usecnt)) { - ret = -EBUSY; - goto put_uobj_pd; - } - old_pd = mr->pd; ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, cmd.hca_va, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c5e0f07..072b94d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1345,7 +1345,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; @@ -1354,13 +1353,9 @@ EXPORT_SYMBOL(ib_get_dma_mr); int ib_dereg_mr(struct ib_mr *mr) { - struct ib_pd *pd; + struct ib_pd *pd = mr->pd; int ret; - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; ret = mr->device->dereg_mr(mr); if (!ret) atomic_dec(&pd->usecnt); @@ -1396,7 +1391,6 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; -- cgit v1.1 From 35c4cbb178119bcacc720f54d32711da020805d5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 21 Dec 2015 08:20:27 -0600 Subject: IB/core: Create get_perf_mad function in sysfs.c Create a new function to retrieve performance management data from the existing code in get_pma_counter(). Reviewed-by: Hal Rosenstock Signed-off-by: Christoph Lameter Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 62 ++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 91847d38..3eeb245 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -398,21 +398,21 @@ struct port_table_attribute port_pma_attr_##_name = { \ .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. 
+ */ +static int get_perf_mad(struct ib_device *dev, int port_num, int attr, + void *data, int offset, size_t size) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; + struct ib_mad *in_mad; + struct ib_mad *out_mad; size_t mad_size = sizeof(*out_mad); u16 out_mad_pkey_index = 0; ssize_t ret; - if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); + if (!dev->process_mad) + return -ENOSYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -425,12 +425,12 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + in_mad->mad_hdr.attr_id = attr; - in_mad->data[41] = p->port_num; /* PortSelect field */ + in_mad->data[41] = port_num; /* PortSelect field */ - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, (const struct ib_mad_hdr *)in_mad, mad_size, (struct ib_mad_hdr *)out_mad, &mad_size, &out_mad_pkey_index) & @@ -439,31 +439,49 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = -EINVAL; goto out; } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, cpu_to_be16(0x12), &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); switch (width) { case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + ret = sprintf(buf, "%u\n", (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + ret = sprintf(buf, "%u\n", *data); break; case 16: ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + be16_to_cpup((__be16 *)data)); break; case 32: ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + be32_to_cpup((__be32 *)data)); break; default: ret = 0; } -out: - kfree(in_mad); - kfree(out_mad); - return ret; } -- cgit v1.1 From b2788ce575df94fb9b03e546980c4f916b24ef76 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 21 Dec 2015 08:20:28 -0600 Subject: IB/core: Specify attribute_id in port_table_attribute Add the attr_id to port_table_attribute since we will have to add a different port_table_attribute for the extended attribute soon.
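For example (the expansion is written out here purely for illustration; it follows the macro as changed below), the existing declaration

	static PORT_PMA_ATTR(port_rcv_packets, 15, 32, 288);

now produces the equivalent of

	struct port_table_attribute port_pma_attr_port_rcv_packets = {
		.attr    = __ATTR(port_rcv_packets, S_IRUGO, show_pma_counter, NULL),
		.index   = 288 | (32 << 16) | (15 << 24),
		.attr_id = IB_PMA_PORT_COUNTERS,
	};

so show_pma_counter() can pass tab_attr->attr_id to get_perf_mad() instead of hard-coding the PortCounters attribute ID.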
Reviewed-by: Hal Rosenstock Signed-off-by: Christoph Lameter Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 3eeb245..09cee09 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -40,6 +40,7 @@ #include #include +#include struct ib_port; @@ -75,6 +76,7 @@ struct port_table_attribute { struct port_attribute attr; char name[8]; int index; + int attr_id; }; static ssize_t port_attr_show(struct kobject *kobj, @@ -395,7 +397,8 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS , \ } /* @@ -457,7 +460,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ssize_t ret; u8 data[8]; - ret = get_perf_mad(p->ibdev, p->port_num, cpu_to_be16(0x12), &data, + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) return sprintf(buf, "N/A (no PMA)\n"); -- cgit v1.1 From 145d9c5410324a6f64d8847f115e5f4fdfddce39 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 21 Dec 2015 08:20:29 -0600 Subject: IB/core: Display extended counter set if available Check if the extended counters are available and if so create the proper extended and additional counters. Signed-off-by: Christoph Lameter Reviewed-by: Ira Weiny Reviewed-by: Hal Rosenstock Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 110 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 09cee09..539040f 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -57,6 +57,7 @@ struct ib_port { struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; + struct attribute_group *pma_table; }; struct port_attribute { @@ -401,6 +402,13 @@ struct port_table_attribute port_pma_attr_##_name = { \ .attr_id = IB_PMA_PORT_COUNTERS , \ } +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ +} + /* * Get a Perfmgmt MAD block of data. * Returns error code or the number of bytes retrieved. 
@@ -481,6 +489,11 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = sprintf(buf, "%u\n", be32_to_cpup((__be32 *)data)); break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)data)); + break; + default: ret = 0; } @@ -505,6 +518,18 @@ static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, @@ -525,11 +550,65 @@ static struct attribute *pma_attrs[] = { NULL }; +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + NULL +}; + static struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -631,6 +710,30 @@ err: return NULL; } +/* + * Figure out which 
counter table to use depending on + the device capabilities. + */ +static struct attribute_group *get_counter_table(struct ib_device *dev) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, 0, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + + if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -673,7 +776,8 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - ret = sysfs_create_group(&p->kobj, &pma_group); + p->pma_table = get_counter_table(device); + ret = sysfs_create_group(&p->kobj, p->pma_table); if (ret) goto err_put_gid_attrs; @@ -780,7 +884,7 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); + sysfs_remove_group(&p->kobj, p->pma_table); err_put_gid_attrs: kobject_put(&p->gid_attr_group->kobj); @@ -990,7 +1094,7 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); - sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); sysfs_remove_group(&port->gid_attr_group->kobj, -- cgit v1.1 From 8a06ce59a4cd034c52c59c44ff6b0785a3969409 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 20 Dec 2015 12:16:10 +0200 Subject: IB/core: Add cross-channel support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cross-channel feature allows executing WQEs that involve synchronization of I/O operations on different QPs. This capability makes it possible to program complex flows with a single function call, thereby significantly reducing the overhead associated with I/O processing. Cross-channel operations support is indicated by HCA capability information. The queue pairs can be configured to work as a “sync master queue” or “sync slave queues”. The added flags are: 1. Device capability flag IB_DEVICE_CROSS_CHANNEL for the devices that can perform cross-channel operations. 2. CQ property flag IB_CQ_FLAGS_IGNORE_OVERRUN to disable the CQ overrun check. This check is useless in the cross-channel scenario. 3. QP property flags to indicate if queues are slave or master: * IB_QP_CREATE_MANAGED_SEND indicates that posted send work requests will not be executed immediately and require enabling. * IB_QP_CREATE_MANAGED_RECV indicates that posted receive work requests will not be executed immediately and require enabling. * IB_QP_CREATE_CROSS_CHANNEL declares the QP to work in cross-channel mode. If IB_QP_CREATE_MANAGED_SEND and IB_QP_CREATE_MANAGED_RECV are not provided, this QP will be a sync master queue; otherwise it will be a sync slave.
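As a usage sketch (hypothetical consumer code, not part of this patch; pd, send_cq, recv_cq and cap are assumed to be prepared by the caller), a sync slave send queue would be created along these lines:

	struct ib_qp_init_attr init_attr = {
		.send_cq      = send_cq,
		.recv_cq      = recv_cq,
		.cap          = cap,
		.qp_type      = IB_QPT_RC,
		/* slave: cross-channel plus a managed send queue */
		.create_flags = IB_QP_CREATE_CROSS_CHANNEL |
				IB_QP_CREATE_MANAGED_SEND,
	};
	struct ib_qp *qp = ib_create_qp(pd, &init_attr);

	if (IS_ERR(qp))
		return PTR_ERR(qp);

A QP created with IB_QP_CREATE_CROSS_CHANNEL alone, without either MANAGED flag, acts as the sync master.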
Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0a84182d..6ffc9c4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1824,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file, sizeof(cmd->create_flags)) attr.create_flags = cmd->create_flags; - if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { ret = -EINVAL; goto err_put; } -- cgit v1.1 From 0d6ed314de18b65a8063cbed450a2ca0c6a16c52 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 10 Dec 2015 16:52:30 -0500 Subject: IB/mad: Ensure fairness in ib_mad_completion_handler It was found that when a process was rapidly sending MADs other processes could be hung in their unregister calls. This would happen when process A was injecting packets fast enough that the single threaded workqueue was never exiting ib_mad_completion_handler. Therefore when process B called flush_workqueue via the unregister call it would hang until process A stopped sending MADs. The fix is to periodically reschedule ib_mad_completion_handler after processing a large number of completions. The number of completions chosen was decided based on the defaults for the recv queue size. However, it was kept fixed such that increasing those queue sizes would not adversely affect fairness in the future. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2281de1..d4d2a61 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -61,6 +61,18 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); +/* + * Define a limit on the number of completions which will be processed by the + * worker thread in a single work item. This ensures that other work items + * (potentially from other users) are processed fairly. + * + * The number of completions was derived from the default queue sizes above. + * We use a value which is double the larger of the 2 queues (receive @ 512) + * but keep it fixed such that an increase in that value does not introduce + * unfairness. 
+ */ +#define MAD_COMPLETION_PROC_LIMIT 1024 + static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; @@ -2555,6 +2567,7 @@ static void ib_mad_completion_handler(struct work_struct *work) { struct ib_mad_port_private *port_priv; struct ib_wc wc; + int count = 0; port_priv = container_of(work, struct ib_mad_port_private, work); ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); @@ -2574,6 +2587,11 @@ static void ib_mad_completion_handler(struct work_struct *work) } } else mad_error_handler(port_priv, &wc); + + if (++count > MAD_COMPLETION_PROC_LIMIT) { + queue_work(port_priv->wq, &port_priv->work); + break; + } } } -- cgit v1.1 From 46e741f4105320875f70b94abaa1e6b089c6c354 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Nov 2015 23:02:51 +0100 Subject: IB/core: constify mmu_notifier_ops structures This mmu_notifier_ops structure is never modified, so declare it as const, like the other mmu_notifier_ops structures. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 40becdb..e69bf26 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, ib_ucontext_notifier_end_account(context); } -static struct mmu_notifier_ops ib_umem_notifiers = { +static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, -- cgit v1.1 From 649367735ee5dedb128d9fac0b86ba7e0fe7ae3b Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 7 Jan 2016 11:19:29 +0200 Subject: IB/cma: Fix RDMA port validation for iWarp cma_validate_port wrongly assumed that Ethernet devices are RoCE devices and thus their ndev should be matched in the GID table. This broke the iWarp support. Fix this by matching the ndev only when working on a RoCE port. Cc: # 4.4.x- Fixes: abae1b71dd37 ('IB/cma: cma_validate_port should verify the port and netdevice') Reported-by: Hariprasad Shenai Tested-by: Hariprasad Shenai Signed-off-by: Matan Barak Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 559ee3d..a811594 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -551,7 +551,7 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER) { + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(&init_net, bound_if_index); if (ndev && ndev->flags & IFF_LOOPBACK) { pr_info("detected loopback device\n"); -- cgit v1.1 From 65487fdc0c8072b6ac32ed30fab34de4e66ace55 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sun, 3 Jan 2016 22:44:25 -0500 Subject: IB/sysfs: Fix sparse warning on attr_id The attribute ID was declared as an int while the value should really be big endian 16.
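The type matters because the PMA attribute IDs are defined pre-swapped to network byte order (a sketch of the relevant definitions, assuming rdma/ib_pma.h):

	#define IB_PMA_CLASS_PORT_INFO	cpu_to_be16(0x0001)
	#define IB_PMA_PORT_COUNTERS	cpu_to_be16(0x0012)

	/* Storing such a value in a plain int and then assigning it to
	 * the __be16 attr_id field of the MAD header is what made
	 * sparse complain about a byte-order type mismatch:
	 */
	in_mad->mad_hdr.attr_id = attr;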
Fixes: 35c4cbb17811 ("IB/core: Create get_perf_mad function in sysfs.c") Reported-by: Bart Van Assche Signed-off-by: Ira Weiny Reviewed-by: Bart Van Assche Reviewed-by: Christoph Lameter Reviewed-by: Hal Rosenstock Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 539040f..aed9b2d 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -77,7 +77,7 @@ struct port_table_attribute { struct port_attribute attr; char name[8]; int index; - int attr_id; + __be16 attr_id; }; static ssize_t port_attr_show(struct kobject *kobj, @@ -413,7 +413,7 @@ struct port_table_attribute port_pma_attr_ext_##_name = { \ * Get a Perfmgmt MAD block of data. * Returns error code or the number of bytes retrieved. */ -static int get_perf_mad(struct ib_device *dev, int port_num, int attr, +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, void *data, int offset, size_t size) { struct ib_mad *in_mad; -- cgit v1.1 From a7d0e959fab832d0614d0e144cb01cbb87d36258 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Jan 2016 12:29:21 +0300 Subject: IB/cma: allocating too much memory in make_cma_ports() The issue here is that there is a cut and paste bug. When we allocate cma_dev_group->default_ports_group we use "sizeof(*cma_dev_group->ports)" instead of "sizeof(*cma_dev_group->default_ports_group)". We're bumping up against the 80 character limit so I introduced a new local pointer "ports_group" to get around that. Fixes: 045959db65c6 ('IB/cma: Add configfs for rdma_cm') Signed-off-by: Dan Carpenter Acked-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma_configfs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index bd1d640..18b112a 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -158,6 +158,7 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, unsigned int i; unsigned int ports_num; struct cma_dev_port_group *ports; + struct config_group **ports_group; int err; ibdev = cma_get_ib_dev(cma_dev); @@ -168,12 +169,9 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, ports_num = ibdev->phys_port_cnt; ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), GFP_KERNEL); + ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL); - cma_dev_group->default_ports_group = kcalloc(ports_num + 1, - sizeof(*cma_dev_group->ports), - GFP_KERNEL); - - if (!ports || !cma_dev_group->default_ports_group) { + if (!ports || !ports_group) { err = -ENOMEM; goto free; } @@ -187,15 +185,16 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, config_group_init_type_name(&ports[i].group, port_str, &cma_port_group_type); - cma_dev_group->default_ports_group[i] = &ports[i].group; + ports_group[i] = &ports[i].group; } - cma_dev_group->default_ports_group[i] = NULL; + ports_group[i] = NULL; + cma_dev_group->default_ports_group = ports_group; cma_dev_group->ports = ports; return 0; free: kfree(ports); - kfree(cma_dev_group->default_ports_group); + kfree(ports_group); cma_dev_group->ports = NULL; cma_dev_group->default_ports_group = NULL; return err; -- cgit v1.1 From ca281265c02f342fed4927b4e98e377d9318881f Mon Sep 17 
00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Jan 2016 14:15:58 +0100 Subject: IB/mad: pass ib_mad_send_buf explicitly to the recv_handler Stop abusing wr_id and just pass the parameter explicitly. Signed-off-by: Christoph Hellwig Reviewed-by: Hal Rosenstock Reviewed-by: Ira Weiny Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 1 + drivers/infiniband/core/mad.c | 18 ++++++++++-------- drivers/infiniband/core/sa_query.c | 7 ++++--- drivers/infiniband/core/user_mad.c | 1 + 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index e3a95d1..ad3726d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3503,6 +3503,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index d4d2a61..cbe232a 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -693,7 +693,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); @@ -1994,9 +1994,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, /* user rmpp is in effect * and this is an active RMPP MAD */ - mad_recv_wc->wc->wr_id = 0; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); } else { /* not user rmpp, revert to normal behavior and @@ -2010,9 +2010,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; @@ -2021,7 +2022,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } @@ -2762,6 +2763,7 @@ static void local_completions(struct work_struct *work) IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index e364a42..1f91b6e 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1669,14 +1669,15 @@ static void send_handler(struct 
ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 57f281f..415a318 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; -- cgit v1.1 From d53e11fdf0538a972ad4f1e5ec315d3e35c1aae9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Jan 2016 22:46:12 -0800 Subject: IB/mad: use CQ abstraction Remove the local workqueue to process mad completions and use the CQ API instead. Signed-off-by: Christoph Hellwig Reviewed-by: Hal Rosenstock Reviewed-by: Ira Weiny Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 162 +++++++++++++------------------------ drivers/infiniband/core/mad_priv.h | 2 +- 2 files changed, 59 insertions(+), 105 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index cbe232a..9fa5bf3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -61,18 +61,6 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* - * Define a limit on the number of completions which will be processed by the - * worker thread in a single work item. This ensures that other work items - * (potentially from other users) are processed fairly. - * - * The number of completions was derived from the default queue sizes above. - * We use a value which is double the larger of the 2 queues (receive @ 512) - * but keep it fixed such that an increase in that value does not introduce - * unfairness. 
- */ -#define MAD_COMPLETION_PROC_LIMIT 1024 - static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; @@ -96,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -701,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; @@ -844,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr.wr_id, drslid, + send_wr->wr.wr_cqe, drslid, send_wr->pkey_index, send_wr->port_num, &mad_wc); @@ -1051,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; - mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.wr.num_sge = 2; mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; @@ -1163,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; @@ -2185,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv, return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; @@ -2199,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, u16 resp_mad_pkey_index = 0; bool opa; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); @@ -2240,7 +2244,7 @@ 
static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { dev_err(&port_priv->device->dev, - "ib_mad_recv_done_handler no memory for response buffer\n"); + "%s: no memory for response buffer\n", __func__); goto out; } @@ -2426,11 +2430,12 @@ done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; @@ -2438,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; @@ -2503,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests @@ -2535,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; @@ -2552,48 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, kfree(attr); if (ret) dev_err(&port_priv->device->dev, - "mad_error_handler - ib_modify_qp to RTS : %d\n", - ret); + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - int count = 0; - - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch 
(wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - - if (++count > MAD_COMPLETION_PROC_LIMIT) { - queue_work(port_priv->wq, &port_priv->work); - break; - } - } + return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2735,7 +2703,7 @@ static void local_completions(struct work_struct *work) * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), local->mad_send_wr->send_wr.pkey_index, recv_mad_agent->agent.port_num, &wc); @@ -2875,17 +2843,6 @@ static void timeout_sends(struct work_struct *work) spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ @@ -2933,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, break; } mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); @@ -3171,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device, unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; - struct ib_cq_init_attr cq_attr = {}; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) return -EFAULT; @@ -3199,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; - cq_attr.cqe = cq_size; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, &cq_attr); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); @@ -3231,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device, ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); @@ -3258,7 +3212,7 @@ error7: error6: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: @@ -3291,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 990698a..28669f6 100644 --- a/drivers/infiniband/core/mad_priv.h +++ 
b/drivers/infiniband/core/mad_priv.h @@ -64,6 +64,7 @@ struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; @@ -204,7 +205,6 @@ struct ib_mad_port_private { struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; -- cgit v1.1

From b6aeb980f1e143f3237365710290b199e9363eeb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 29 Dec 2015 10:45:03 +0100 Subject: IB/core: Remove set-but-not-used variable from ib_sg_to_pages()

Detected this by building the IB core with W=1. See also patch "IB core: Fix ib_sg_to_pages()" (commit 8f5ba10ed40a).

Signed-off-by: Bart Van Assche Cc: Sagi Grimberg Cc: Christoph Hellwig Reviewed-by: Leon Romanovsky Acked-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 072b94d..95fccea 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1611,7 +1611,7 @@ int ib_sg_to_pages(struct ib_mr *mr, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; - u64 last_end_dma_addr = 0, last_page_addr = 0; + u64 last_end_dma_addr = 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; @@ -1653,7 +1653,6 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; - last_page_addr = end_dma_addr & page_mask; last_page_off = end_dma_addr & ~page_mask; } -- cgit v1.1

From 6e2a51a0f7c89a22169f366d23d5f213b7b7edaf Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Tue, 29 Dec 2015 05:43:43 -0500 Subject: IB/core: sysfs.c: Fix PerfMgt ClassPortInfo handling

Port number is not part of the ClassPortInfo attribute but is still needed as a parameter when invoking process_mad. To properly handle this attribute, port_num is added as a parameter to get_counter_table, and get_perf_mad was changed not to store port_num in the attribute itself when querying the ClassPortInfo attribute. This handles an issue pointed out by Matan Barak.

Fixes: 145d9c541032 ('IB/core: Display extended counter set if available') Signed-off-by: Hal Rosenstock Acked-by: Matan Barak Acked-by: Ira Weiny Reviewed-by: Christoph Lameter Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index aed9b2d..3de9351 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -438,7 +438,8 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; in_mad->mad_hdr.attr_id = attr; - in_mad->data[41] = port_num; /* PortSelect field */ + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL, @@ -714,11 +715,12 @@ err: * Figure out which counter table to use depending on * the device capabilities.
*/ -static struct attribute_group *get_counter_table(struct ib_device *dev) +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) { struct ib_class_port_info cpi; - if (get_perf_mad(dev, 0, IB_PMA_CLASS_PORT_INFO, + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, &cpi, 40, sizeof(cpi)) >= 0) { if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH) @@ -776,7 +778,7 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - p->pma_table = get_counter_table(device); + p->pma_table = get_counter_table(device, port_num); ret = sysfs_create_group(&p->kobj, p->pma_table); if (ret) goto err_put_gid_attrs; -- cgit v1.1

From 2e2cdace5a26507a564d319a1338920a838ef52e Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 30 Dec 2015 16:14:17 +0200 Subject: IB/core: Eliminate sparse false context imbalance warning

When the write_gid function needs to do a sleepable operation, it unlocks table->rwlock and then relocks it. Sparse complains about context imbalance. This is safe because write_gid is always called with table->rwlock held; write_gid protects against simultaneous writes to this GID entry by setting the GID_TABLE_ENTRY_INVALID flag.

Fixes: 9c584f049596 ('IB/core: Change per-entry lock in RoCE GID table to one lock') Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 92cadbd..53343ff 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -174,6 +174,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, const struct ib_gid_attr *attr, enum gid_table_write_action action, bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) { int ret = 0; struct net_device *old_net_dev; -- cgit v1.1

From 9506902b7be2d8ccc235c14ccaec7f07b2c7d520 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 30 Dec 2015 16:14:18 +0200 Subject: IB/core: Fix dereference before check

Sparse complains about a dereference before a check. Fix this by moving the check before the dereference.

Fixes: 200298326b27 ('IB/core: Validate route when we init ah') Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0b5f245..791cc98 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -497,13 +497,14 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, struct sockaddr_storage ssrc_addr = {}; struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; - if (src_addr->sa_family != dst_addr->sa_family) - return -EINVAL; + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; - if (src_addr) memcpy(src_in, src_addr, rdma_addr_size(src_addr)); - else + } else { src_in->sa_family = dst_addr->sa_family; + } return addr_resolve(src_in, dst_addr, addr, false); } -- cgit v1.1

From 4bfdf635c668869c69fd18ece37ec66fb6f38fcf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 1 Jan 2016 13:17:46 +0100 Subject: IB/cm: Fix a recently introduced deadlock

ib_send_cm_drep() calls cm_enter_timewait() while holding a spinlock that can be locked from inside an interrupt handler. Hence do not enable interrupts inside cm_enter_timewait() if called with interrupts disabled.
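(As an aside, here is a minimal sketch of the locking rule this fix applies, using hypothetical names rather than the actual cm.c code: a function that may be entered with interrupts already disabled must save and restore the caller's IRQ state instead of unconditionally re-enabling interrupts on unlock.)

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);	/* hypothetical lock, for illustration */

    /* May run with IRQs on or off (e.g. under a caller's own irqsave
     * lock taken from hardirq context), so the unlock must restore,
     * not unconditionally re-enable, the interrupt state. Using
     * spin_unlock_irq() here would reintroduce exactly the
     * HARDIRQ-ON-W vs. IN-HARDIRQ-W inversion shown below. */
    static void demo_enter_timewait(void)
    {
    	unsigned long flags;

    	spin_lock_irqsave(&demo_lock, flags);
    	/* ... queue delayed work, as cm_enter_timewait() does ... */
    	spin_unlock_irqrestore(&demo_lock, flags);
    }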
This patch fixes e.g. the following deadlock:

=================================
[ INFO: inconsistent lock state ]
4.4.0-rc7+ #1 Tainted: G E
---------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
swapper/8/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
(&(&cm_id_priv->lock)->rlock){?.+...}, at: [] cm_establish+0x74/0x1b0 [ib_cm]
{HARDIRQ-ON-W} state was registered at:
[] mark_held_locks+0x71/0x90
[] trace_hardirqs_on_caller+0xa7/0x1c0
[] trace_hardirqs_on+0xd/0x10
[] _raw_spin_unlock_irq+0x2b/0x40
[] cm_enter_timewait+0xae/0x100 [ib_cm]
[] ib_send_cm_drep+0xb6/0x190 [ib_cm]
[] srp_cm_handler+0x128/0x1a0 [ib_srp]
[] cm_process_work+0x20/0xf0 [ib_cm]
[] cm_dreq_handler+0x135/0x2c0 [ib_cm]
[] cm_work_handler+0x75/0xd0 [ib_cm]
[] process_one_work+0x1bd/0x460
[] worker_thread+0x118/0x420
[] kthread+0xe4/0x100
[] ret_from_fork+0x3f/0x70
irq event stamp: 1672286
hardirqs last enabled at (1672283): [] poll_idle+0x10/0x80
hardirqs last disabled at (1672284): [] common_interrupt+0x84/0x89
softirqs last enabled at (1672286): [] _local_bh_enable+0x1c/0x50
softirqs last disabled at (1672285): [] irq_enter+0x47/0x70

other info that might help us debug this:
Possible unsafe locking scenario:

CPU0
----
lock(&(&cm_id_priv->lock)->rlock);

lock(&(&cm_id_priv->lock)->rlock);

*** DEADLOCK ***

no locks held by swapper/8/0.

stack backtrace:
CPU: 8 PID: 0 Comm: swapper/8 Tainted: G E 4.4.0-rc7+ #1
Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
ffff88045af5e950 ffff88046e503a88 ffffffff81251c1b 0000000000000007
0000000000000006 0000000000000003 ffff88045af5ddc0 ffff88046e503ad8
ffffffff810a32f4 0000000000000000 0000000000000000 0000000000000001
Call Trace:
[] dump_stack+0x4f/0x74
[] print_usage_bug+0x184/0x190
[] mark_lock_irq+0xf2/0x290
[] mark_lock+0x115/0x1b0
[] mark_irqflags+0x15c/0x170
[] __lock_acquire+0x1ef/0x560
[] lock_acquire+0x62/0x80
[] _raw_spin_lock_irqsave+0x43/0x60
[] cm_establish+0x74/0x1b0 [ib_cm]
[] ib_cm_notify+0x31/0x100 [ib_cm]
[] srpt_qp_event+0x54/0xd0 [ib_srpt]
[] mlx4_ib_qp_event+0x72/0xc0 [mlx4_ib]
[] mlx4_qp_event+0x69/0xd0 [mlx4_core]
[] mlx4_eq_int+0x51e/0xd50 [mlx4_core]
[] mlx4_msi_x_interrupt+0xf/0x20 [mlx4_core]
[] handle_irq_event_percpu+0x40/0x110
[] handle_irq_event+0x3f/0x70
[] handle_edge_irq+0x79/0x120
[] handle_irq+0x5d/0x130
[] do_IRQ+0x6d/0x130
[] common_interrupt+0x89/0x89
[] cpuidle_enter_state+0xcf/0x200
[] cpuidle_enter+0x12/0x20
[] call_cpuidle+0x36/0x60
[] cpuidle_idle_call+0x63/0x110
[] cpu_idle_loop+0xfa/0x130
[] cpu_startup_entry+0xe/0x10
[] start_secondary+0x83/0x90

Fixes: commit be4b499323bf ("IB/cm: Do not queue work to a device that's going away") Signed-off-by: Bart Van Assche Acked-by: Erez Shitrit Cc: Erez Shitrit Cc: stable Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index ad3726d..0ba0463 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ - spin_lock_irq(&cm.lock); + spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); - spin_unlock_irq(&cm.lock); + spin_unlock_irqrestore(&cm.lock, flags); cm_id_priv->timewait_info =
NULL; } -- cgit v1.1

From f7f4b23e27f7561330ef13f93dbe8f2dc410efa7 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 4 Jan 2016 10:49:53 +0200 Subject: IB/core: Rename rdma_addr_find_dmac_by_grh

rdma_addr_find_dmac_by_grh resolves dmac, vlan_id and if_index, and a downstream patch will also add hop_limit as an output parameter, so rename it to rdma_addr_find_l2_eth_by_grh.

Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 7 ++++--- drivers/infiniband/core/verbs.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 791cc98..af1d040 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -541,8 +541,9 @@ static void resolve_cb(int status, struct sockaddr *src_addr, complete(&((struct resolve_cb_context *)context)->comp); } -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int *if_index) +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -584,7 +585,7 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi dev_put(dev); return ret; } -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 95fccea..97cbc96 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -467,11 +467,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (!idev) return -ENODEV; - ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid, - ah_attr->dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - &if_index); + ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, + &if_index); if (ret) { dev_put(idev); return ret; @@ -1158,10 +1158,10 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ifindex = sgid_attr.ndev->ifindex; - ret = rdma_addr_find_dmac_by_grh(&sgid, - &qp_attr->ah_attr.grh.dgid, - qp_attr->ah_attr.dmac, - NULL, &ifindex); + ret = rdma_addr_find_l2_eth_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, &ifindex); dev_put(sgid_attr.ndev); } -- cgit v1.1

From c3efe7500add077f79d37b18e9c66df6621409b6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 4 Jan 2016 10:49:54 +0200 Subject: IB/core: Use hop-limit from IP stack for RoCE

Previously, IPV6_DEFAULT_HOPLIMIT was used as the hop limit value for RoCE. Fix that by taking the hop limit from ip4_dst_hoplimit and ip6_dst_hoplimit instead, as sketched below.
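(An illustrative aside, not part of the patch: both helpers take the resolved route's dst_entry, so a caller that has already done the route lookup can ask the IP stack for the hop limit it would itself use. The helper name here is made up for the sketch.)

    #include <linux/socket.h>
    #include <net/route.h>	/* ip4_dst_hoplimit() */
    #include <net/ipv6.h>	/* ip6_dst_hoplimit() */

    /* Mirrors what the patch stores in dev_addr.hoplimit: the
     * per-route hop limit, falling back to the kernel's default
     * TTL/hop-limit sysctls when the route carries no metric. */
    static int demo_route_hoplimit(struct dst_entry *dst, sa_family_t family)
    {
    	if (family == AF_INET)
    		return ip4_dst_hoplimit(dst);
    	return ip6_dst_hoplimit(dst);
    }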
Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 9 ++++++++- drivers/infiniband/core/cm.c | 1 + drivers/infiniband/core/cma.c | 12 +++++------- drivers/infiniband/core/verbs.c | 16 +++++++--------- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index af1d040..337353d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -252,6 +252,8 @@ static int addr4_resolve(struct sockaddr_in *src_in, if (rt->rt_uses_gateway) addr->network = RDMA_NETWORK_IPV4; + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); + *prt = rt; return 0; out: @@ -295,6 +297,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if (rt->rt6i_flags & RTF_GATEWAY) addr->network = RDMA_NETWORK_IPV6; + addr->hoplimit = ip6_dst_hoplimit(dst); + *pdst = dst; return 0; put: @@ -543,7 +547,8 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int *if_index) + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -582,6 +587,8 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, *if_index = dev_addr.bound_dev_if; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; dev_put(dev); return ret; } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0ba0463..1d92e09 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1641,6 +1641,7 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; ret = ib_get_cached_gid(work->port->cm_dev->ib_device, work->port->port_num, cm_id_priv->av.ah_attr.grh.sgid_index, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a811594..bbcfa76 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2424,7 +2424,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; - enum ib_gid_type network_gid_type; struct cma_work *work; int ret; struct net_device *ndev = NULL; @@ -2478,14 +2477,13 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) &route->path_rec->dgid); /* Use the hint from IP Stack to select GID Type */ - network_gid_type = ib_network_to_gid_type(addr->dev_addr.network); - if (addr->dev_addr.network != RDMA_NETWORK_IB) { - route->path_rec->gid_type = network_gid_type; + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ - route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT; - } else { + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else route->path_rec->hop_limit = 1; - } route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 97cbc96..5af6d02 100644 --- 
a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -434,6 +434,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, int ret; enum rdma_network_type net_type = RDMA_NETWORK_IB; enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; union ib_gid dgid; union ib_gid sgid; @@ -471,7 +472,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->dmac, wc->wc_flags & IB_WC_WITH_VLAN ? NULL : &vlan_id, - &if_index); + &if_index, &hoplimit); if (ret) { dev_put(idev); return ret; @@ -520,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.hop_limit = hoplimit; ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; } return 0; @@ -1138,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, union ib_gid sgid; struct ib_gid_attr sgid_attr; int ifindex; + int hop_limit; ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, @@ -1149,21 +1151,17 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ret = -ENXIO; goto out; } - if (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - /* TODO: get the hoplimit from the inet/inet6 - * device - */ - qp_attr->ah_attr.grh.hop_limit = - IPV6_DEFAULT_HOPLIMIT; ifindex = sgid_attr.ndev->ifindex; ret = rdma_addr_find_l2_eth_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, qp_attr->ah_attr.dmac, - NULL, &ifindex); + NULL, &ifindex, &hop_limit); dev_put(sgid_attr.ndev); + + qp_attr->ah_attr.grh.hop_limit = hop_limit; } } out: -- cgit v1.1

From 3ef967a4affeef7bb3b7713dcfed6518b99737c6 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:41 +0200 Subject: IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers

RoCEv2 packets are sent over IP/UDP protocols. The mlx4 driver uses a type of RAW QP to send packets for QP1 and therefore needs to build the network headers below BTH in software. This patch adds an option to build QP1 packets with IP and UDP headers if RoCEv2 is requested.

Signed-off-by: Moni Shoua Signed-off-by: Doug Ledford --- drivers/infiniband/core/ud_header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 96697e7..19837d2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -280,7 +280,7 @@ static const struct ib_field deth_table[] = { .size_bits = 24 } }; -__be16 ib_ud_ip4_csum(struct ib_ud_header *header) +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header) { struct iphdr iph; -- cgit v1.1

From 2deeb4772971e56d5bdac0bd3375d5eadaa827fd Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 21 Jan 2016 08:41:31 -0500 Subject: IB/sa: Fix netlink local service GFP crash

The rdma netlink local service registers a handler to handle the RESOLVE response and another handler to handle the SET_TIMEOUT request. The first thing these handlers do is call netlink_capable() to check the access rights of the received skb, to make sure that the sender has root access. Under normal conditions, such responses and requests will be directly forwarded to the handlers without going through the netlink_dump pathway (see ibnl_rcv_msg() in drivers/infiniband/core/netlink.c).
However, a user application could send a RESOLVE request (not response) to the local service, which will fall into the netlink_dump pathway, where a new skb will be created without initializing the control block. This new skb will eventually be forwarded to the local service RESOLVE response handler. Unfortunately, netlink_capable() will cause a general protection fault if the skb's control block is not initialized. This patch addresses the problem by checking the skb first; see the sketch after the diff.

Signed-off-by: Kaike Wan Reported-by: Dmitry Vyukov Signed-off-by: Doug Ledford --- drivers/infiniband/core/sa_query.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1f91b6e..f334090 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -717,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -791,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb, int found = 0; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); -- cgit v1.1
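(To close out the series: a hedged sketch of the validation pattern this last patch establishes, with a hypothetical handler name rather than the actual sa_query.c code. The cheap structural checks run first, so netlink_capable() only ever sees an skb whose control block is known to be initialized.)

    #include <linux/capability.h>
    #include <linux/errno.h>
    #include <linux/netlink.h>
    #include <linux/skbuff.h>

    /* Hypothetical response handler: reject request-flagged messages,
     * skbs whose control block was never set up (NETLINK_CB(skb).sk is
     * NULL on the netlink_dump path), and unprivileged senders -- in
     * that order, so netlink_capable() never dereferences a bad cb. */
    static int demo_handle_resolve_resp(struct sk_buff *skb,
    				    struct nlmsghdr *nlh)
    {
    	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
    	    !NETLINK_CB(skb).sk ||
    	    !netlink_capable(skb, CAP_NET_ADMIN))
    		return -EPERM;

    	/* ... parse nlmsg_data(nlh) and complete the pending request ... */
    	return 0;
    }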