From ecdc428e4c5d821a07baf4f8b1718faf67b9026f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 12 Nov 2009 11:14:13 -0800 Subject: IB/mlx4: Remove unneeded code There is no such flag DE - the field is reserved and should be zero. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 219b103..518d5619 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -897,7 +897,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(ibqp->qp_type) << 16)); - context->flags |= cpu_to_be32(1 << 8); /* DE? */ if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); -- cgit v1.1 From 417608c20a4c8397bc5307d949ec01ea0a0dd8e5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 12 Nov 2009 11:19:44 -0800 Subject: IB/mlx4: Remove limitation on LSO header size Current code has a limitation: an LSO header is not allowed to cross a 64 byte boundary. This patch removes this limitation by setting the WQE RR for large headers thus allowing LSO headers of any size. The extra buffer reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes, assuming this is reasonable upper limit for header length. Also, this patch will cause IB_DEVICE_UD_TSO to be set only for HCA FW versions that set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/qp.c | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3cb3f47..e596537 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev->dev->caps.max_gso_sz) + if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 518d5619..847030c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -54,7 +54,8 @@ enum { /* * Largest possible UD header: send with GRH and immediate data. */ - MLX4_IB_UD_HEADER_SIZE = 72 + MLX4_IB_UD_HEADER_SIZE = 72, + MLX4_IB_LSO_HEADER_SPARE = 128, }; struct mlx4_ib_sqp { @@ -67,7 +68,8 @@ struct mlx4_ib_sqp { }; enum { - MLX4_IB_MIN_SQ_STRIDE = 6 + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, }; static const __be32 mlx4_ib_opcode[] = { @@ -261,7 +263,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) case IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags & MLX4_IB_QP_LSO) ? 64 : 0); + ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); case IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); @@ -1466,16 +1468,12 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz) + __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); - /* - * This is a temporary limitation and will be removed in - * a forthcoming FW release: - */ - if (unlikely(halign > 64)) - return -EINVAL; + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) @@ -1521,6 +1519,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; int i; spin_lock_irqsave(&qp->sq.lock, flags); @@ -1529,6 +1528,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; + blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; @@ -1615,7 +1615,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz); + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -1686,7 +1686,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); -- cgit v1.1 From c1ccaf2478f84c2665cf57f981db143aa582d646 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 12 Nov 2009 11:32:27 -0800 Subject: IB/iser: Rewrite SG handling for RDMA logic After dma-mapping an SG list provided by the SCSI midlayer, iser has to make sure the mapped SG is "aligned for RDMA" in the sense that its possible to produce one mapping in the HCA IOMMU which represents the whole SG. Next, the mapped SG is formatted for registration with the HCA. This patch re-writes the logic that does the above, to make it clearer and simpler. It also fixes a bug in the being aligned for RDMA checks, where a "start" check wasn't done but rather only "end" check. Signed-off-by: Alexander Nezhinsky Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_memory.c | 122 ++++++++++++++---------------- 1 file changed, 56 insertions(+), 66 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index b9453d0..274c883 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -209,6 +209,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, mem_copy->copy_buf = NULL; } +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + /** * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses * and returns the length of resulting physical address array (may be less than @@ -221,62 +223,52 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, * where --few fragments of the same page-- are present in the SG as * consecutive elements. Also, it handles one entry SG. */ + static int iser_sg_to_page_vec(struct iser_data_buf *data, struct iser_page_vec *page_vec, struct ib_device *ibdev) { - struct scatterlist *sgl = (struct scatterlist *)data->buf; - struct scatterlist *sg; - u64 first_addr, last_addr, page; - int end_aligned; - unsigned int cur_page = 0; + struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + u64 start_addr, end_addr, page, chunk_start = 0; unsigned long total_sz = 0; - int i; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; /* compute the offset of first element */ page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; + new_chunk = 1; + cur_page = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; total_sz += dma_len; - first_addr = ib_sg_dma_address(ibdev, sg); - last_addr = first_addr + dma_len; - - end_aligned = !(last_addr & ~MASK_4K); - - /* continue to collect page fragments till aligned or SG ends */ - while (!end_aligned && (i + 1 < data->dma_nents)) { - sg = sg_next(sg); - i++; - dma_len = ib_sg_dma_len(ibdev, sg); - total_sz += dma_len; - last_addr = ib_sg_dma_address(ibdev, sg) + dma_len; - end_aligned = !(last_addr & ~MASK_4K); + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; } - - /* handle the 1st page in the 1st DMA element */ - if (cur_page == 0) { - page = first_addr & MASK_4K; - page_vec->pages[cur_page] = page; - cur_page++; + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + page_vec->pages[cur_page++] = page; page += SIZE_4K; - } else - page = first_addr; - - for (; page < last_addr; page += SIZE_4K) { - page_vec->pages[cur_page] = page; - cur_page++; - } - + } while (page < end_addr); } + page_vec->data_size = total_sz; iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); return cur_page; } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned @@ -284,42 +276,40 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. */ -static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, - struct ib_device *ibdev) +static int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) { - struct scatterlist *sgl, *sg; - u64 end_addr, next_addr; - int i, cnt; - unsigned int ret_len = 0; + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return 1; sgl = (struct scatterlist *)data->buf; + start_addr = ib_sg_dma_address(ibdev, sgl); - cnt = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " - "offset: %ld sz: %ld\n", i, - (unsigned long)sg_phys(sg), - (unsigned long)sg->offset, - (unsigned long)sg->length); */ - end_addr = ib_sg_dma_address(ibdev, sg) + - ib_sg_dma_len(ibdev, sg); - /* iser_dbg("Checking sg iobuf end address " - "0x%08lX\n", end_addr); */ - if (i + 1 < data->dma_nents) { - next_addr = ib_sg_dma_address(ibdev, sg_next(sg)); - /* are i, i+1 fragments of the same page? */ - if (end_addr == next_addr) { - cnt++; - continue; - } else if (!IS_4K_ALIGNED(end_addr)) { - ret_len = cnt + 1; - break; - } - } - cnt++; + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; } - if (i == data->dma_nents) - ret_len = cnt; /* loop ended */ + ret_len = (next_sg) ? i : i+1; iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", ret_len, data->dma_nents, data); return ret_len; -- cgit v1.1 From a7ca1f00ed2921b804d7ebda0f6fca8c9078fa42 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 16 Nov 2009 09:30:33 -0800 Subject: RDMA/ucma: Add option to manually set IB path Export rdma_set_ib_paths to user space to allow applications to manually set the IB path used for connections. This allows alternative ways for a user space application or library to obtain path record information, including retrieving path information from cached data, avoiding direct interaction with the IB SA. The IB SA is a single, centralized entity that can limit scaling on large clusters running MPI applications. Future changes to the rdma cm can expand on this framework to support the full range of features allowed by the IB CM, such as separate forward and reverse paths and APM. Signed-off-by: Sean Hefty Reviewed-By: Jason Gunthorpe Signed-off-by: Roland Dreier --- drivers/infiniband/core/sa_query.c | 6 +++++ drivers/infiniband/core/ucma.c | 49 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8254371..7e1ffd8c 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -604,6 +604,12 @@ retry: return ret ? ret : id; } +void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) +{ + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); +} +EXPORT_SYMBOL(ib_sa_unpack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index bb96d3c..f1cbd26 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -43,6 +43,7 @@ #include #include #include +#include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -812,6 +813,51 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, return ret; } +static int ucma_set_ib_path(struct ucma_context *ctx, + struct ib_path_rec_data *path_data, size_t optlen) +{ + struct ib_sa_path_rec sa_path; + struct rdma_cm_event event; + int ret; + + if (optlen % sizeof(*path_data)) + return -EINVAL; + + for (; optlen; optlen -= sizeof(*path_data), path_data++) { + if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL)) + break; + } + + if (!optlen) + return -EINVAL; + + ib_sa_unpack_path(path_data->path_rec, &sa_path); + ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + if (ret) + return ret; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + return ucma_event_handler(ctx->cm_id, &event); +} + +static int ucma_set_option_ib(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret; + + switch (optname) { + case RDMA_OPTION_IB_PATH: + ret = ucma_set_ib_path(ctx, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { @@ -821,6 +867,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level, case RDMA_OPTION_ID: ret = ucma_set_option_id(ctx, optname, optval, optlen); break; + case RDMA_OPTION_IB: + ret = ucma_set_option_ib(ctx, optname, optval, optlen); + break; default: ret = -ENOSYS; } -- cgit v1.1 From 0f9ea5d2ab5cef732d5abbe62b9e9af3007bae81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2009 14:24:34 -0800 Subject: RDMA/addr: Use appropriate locking with for_each_netdev() for_each_netdev() should be used with RTNL or dev_base_lock held, or else we risk a crash. Signed-off-by: Eric Dumazet Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index bd07803..373f111 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -131,6 +131,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: + read_lock(&dev_base_lock); for_each_netdev(&init_net, dev) { if (ipv6_chk_addr(&init_net, &((struct sockaddr_in6 *) addr)->sin6_addr, @@ -139,6 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) break; } } + read_unlock(&dev_base_lock); break; #endif } @@ -391,14 +393,17 @@ static int addr_resolve_local(struct sockaddr *src_in, { struct in6_addr *a; + read_lock(&dev_base_lock); for_each_netdev(&init_net, dev) if (ipv6_chk_addr(&init_net, &((struct sockaddr_in6 *) dst_in)->sin6_addr, dev, 1)) break; - if (!dev) + if (!dev) { + read_unlock(&dev_base_lock); return -EADDRNOTAVAIL; + } a = &((struct sockaddr_in6 *) src_in)->sin6_addr; @@ -416,6 +421,7 @@ static int addr_resolve_local(struct sockaddr *src_in, if (!ret) memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); } + read_unlock(&dev_base_lock); break; } #endif -- cgit v1.1 From 1c9b281997b5876c0c8ed62506b56db89d262b57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 Nov 2009 12:55:21 -0800 Subject: RDMA/cma: Correct detection of SA Created MGID RDMA CM treats AF_INET6 addresses that are either 0 or prefixed with FF1x:A01B::/32 as MGIDs, but the detection for the prefix was buggy; fix it up. Signed-off-by: Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0753178..8bb2cf4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2687,7 +2687,7 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && - ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFF10A01B) == + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); -- cgit v1.1 From e2e626972e652d18520f84d69fc06cfa307d11ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: RDMA/cma: Fix AF_INET6 support in multicast joining If joining to an AF_INET6 address, we need to map the address to a MGID in the same way as the IP stack. The old code would just fall through to the IPv4 case and generate garbage. Signed-off-by: Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8bb2cf4..052b4c0 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2691,6 +2691,11 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6)) { + ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) -- cgit v1.1 From 6266ed6e4164466177238b11ecb825a3a108a3e4 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: RDMA/cma: Replace net_device pointer with index Provide the device interface when resolving route information to ensure that the correct outbound device is used. This will also simplify processing of sin6_scope_id for IPv6 support. Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 14 +++++++++++++- drivers/infiniband/core/cma.c | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 373f111..788a02e 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -107,7 +107,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); - dev_addr->src_dev = dev; + dev_addr->bound_dev_if = dev->ifindex; return 0; } EXPORT_SYMBOL(rdma_copy_addr); @@ -117,6 +117,15 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) struct net_device *dev; int ret = -EADDRNOTAVAIL; + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(&init_net, @@ -231,6 +240,8 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = dst_ip; fl.nl_u.ip4_u.saddr = src_ip; + fl.oif = addr->bound_dev_if; + ret = ip_route_output_key(&init_net, &rt, &fl); if (ret) goto out; @@ -279,6 +290,7 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + fl.oif = addr->bound_dev_if; dst = ip6_route_output(&init_net, NULL, &fl); if (!dst) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 052b4c0..699ad12 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2820,7 +2820,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id dev_addr = &id_priv->id.route.addr.dev_addr; - if ((dev_addr->src_dev == ndev) && + if ((dev_addr->bound_dev_if == ndev->ifindex) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); -- cgit v1.1 From d2e0886245aa9eebc1a4710c861d263b09eac493 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: IB/addr: Verify source and destination address families match If a source address is provided, verify that the address family matches that of the destination address. If the source is not specified, use the same address family as the destination. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 788a02e..b59ba7c 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -461,18 +461,27 @@ int rdma_resolve_ip(struct rdma_addr_client *client, if (!req) return -ENOMEM; - if (src_addr) - memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr)); - memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr)); + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, ip_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_local(src_in, dst_in, addr); if (req->status == -EADDRNOTAVAIL) req->status = addr_resolve_remote(src_in, dst_in, addr); @@ -490,10 +499,12 @@ int rdma_resolve_ip(struct rdma_addr_client *client, default: ret = req->status; atomic_dec(&client->refcount); - kfree(req); - break; + goto err; } return ret; +err: + kfree(req); + return ret; } EXPORT_SYMBOL(rdma_resolve_ip); -- cgit v1.1 From c4315d85f9b76834289fd503796c01b8311c4b84 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:57:18 -0800 Subject: IB/addr: Store net_device type instead of translating to RDMA transport The struct rdma_dev_addr stores net_device address information: the source device address, destination hardware address, and broadcast address. For consistency, store the net_device type rather than converting it to the rdma_node_type. The type indicates the format of the various hardware addresses, which is what we're concerned with, and not the RDMA node type that the address may map to. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 13 +------------ drivers/infiniband/core/cma.c | 6 +++--- 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b59ba7c..de5fe16 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -92,17 +91,7 @@ EXPORT_SYMBOL(rdma_addr_unregister_client); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr) { - switch (dev->type) { - case ARPHRD_INFINIBAND: - dev_addr->dev_type = RDMA_NODE_IB_CA; - break; - case ARPHRD_ETHER: - dev_addr->dev_type = RDMA_NODE_RNIC; - break; - default: - return -EADDRNOTAVAIL; - } - + dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 699ad12..b305b5c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -330,11 +330,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) union ib_gid gid; int ret = -ENODEV; - switch (rdma_node_get_transport(dev_addr->dev_type)) { - case RDMA_TRANSPORT_IB: + switch (dev_addr->dev_type) { + case ARPHRD_INFINIBAND: ib_addr_get_sgid(dev_addr, &gid); break; - case RDMA_TRANSPORT_IWARP: + case ARPHRD_ETHER: iw_addr_get_sgid(dev_addr, &gid); break; default: -- cgit v1.1 From 6f8372b69c3198e06cecb1df2cb9682d0c55e657 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 13:26:06 -0800 Subject: RDMA/cm: fix loopback address support The RDMA CM is intended to support the use of a loopback address when establishing a connection; however, the behavior of the CM when loopback addresses are used is confusing and does not always work, depending on whether loopback was specified by the server, the client, or both. The defined behavior of rdma_bind_addr is to associate an RDMA device with an rdma_cm_id, as long as the user specified a non- zero address. (ie they weren't just trying to reserve a port) Currently, if the loopback address is passed to rdam_bind_addr, no device is associated with the rdma_cm_id. Fix this. If a loopback address is specified by the client as the destination address for a connection, it will fail to establish a connection. This is true even if the server is listing across all addresses or on the loopback address itself. The issue is that the server tries to translate the IP address carried in the REQ message to a local net_device address, which fails. The translation is not needed in this case, since the REQ carries the actual HW address that should be used. Finally, cleanup loopback support to be more transport neutral. Replace separate calls to get/set the sgid and dgid from the device address to a single call that behaves correctly depending on the format of the device address. And support both IPv4 and IPv6 address formats. Signed-off-by: Sean Hefty [ Fixed RDS build by s/ib_addr_get/rdma_addr_get/ - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 77 +++++++++++++++++++++++------------------- drivers/infiniband/core/ucma.c | 8 ++--- 2 files changed, 47 insertions(+), 38 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index b305b5c..38867a4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -330,17 +330,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) union ib_gid gid; int ret = -ENODEV; - switch (dev_addr->dev_type) { - case ARPHRD_INFINIBAND: - ib_addr_get_sgid(dev_addr, &gid); - break; - case ARPHRD_ETHER: - iw_addr_get_sgid(dev_addr, &gid); - break; - default: - return -ENODEV; - } - + rdma_addr_get_sgid(dev_addr, &gid); list_for_each_entry(cma_dev, &dev_list, list) { ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); @@ -1032,11 +1022,17 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); - if (ret) - goto destroy_id; + if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey); + } else { + ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, + &rt->addr.dev_addr); + if (ret) + goto destroy_id; + } + rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; @@ -1071,10 +1067,12 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); - if (ret) - goto err; + if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { + ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, + &id->route.addr.dev_addr); + if (ret) + goto err; + } id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; @@ -1565,8 +1563,8 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); - ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); + rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; @@ -1781,7 +1779,11 @@ port_found: if (ret) goto out; - ib_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + id_priv->id.route.addr.dev_addr.dev_type = + (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB) ? + ARPHRD_INFINIBAND : ARPHRD_ETHER; + + rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); @@ -1839,7 +1841,7 @@ out: static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr_in *src_in, *dst_in; + struct sockaddr *src, *dst; union ib_gid gid; int ret; @@ -1853,14 +1855,19 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) goto err; } - ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); - ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - if (cma_zero_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) { - src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr; - dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr; - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr; + src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + if (cma_zero_addr(src)) { + dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; + if ((src->sa_family = dst->sa_family) == AF_INET) { + ((struct sockaddr_in *) src)->sin_addr.s_addr = + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + } else { + ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + } } work->id = id_priv; @@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; - if (!cma_any_addr(addr)) { + if (cma_loopback_addr(addr)) { + ret = cma_bind_loopback(id_priv); + } else if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); if (ret) goto err1; @@ -2108,7 +2117,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (!cma_any_addr(addr)) { + if (id_priv->cma_dev) { mutex_lock(&lock); cma_detach_from_dev(id_priv); mutex_unlock(&lock); @@ -2721,7 +2730,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); - ib_addr_get_sgid(dev_addr, &rec.port_gid); + rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index f1cbd26..b2e16c3 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -563,10 +563,10 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, switch (route->num_paths) { case 0: dev_addr = &route->addr.dev_addr; - ib_addr_get_dgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].dgid); - ib_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + rdma_addr_get_dgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: -- cgit v1.1 From 923c100ef019bf15fb89b6fa3d3ad0485d25d59b Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 13:26:51 -0800 Subject: IB/addr: Simplify resolving IPv4 addresses Merge resolve local/remote address resolution into a single data flow to ensure consistent access and use of the local routing tables. Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 81 ++++++++++++------------------------------ 1 file changed, 23 insertions(+), 58 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index de5fe16..38a7184 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -184,17 +184,6 @@ static void addr_send_arp(struct sockaddr *dst_in) memset(&fl, 0, sizeof fl); switch (dst_in->sa_family) { - case AF_INET: - fl.nl_u.ip4_u.daddr = - ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - if (ip_route_output_key(&init_net, &rt, &fl)) - return; - - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rt); - break; - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: { @@ -215,9 +204,9 @@ static void addr_send_arp(struct sockaddr *dst_in) } } -static int addr4_resolve_remote(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) +static int addr4_resolve(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -235,6 +224,16 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, if (ret) goto out; + src_in->sin_family = AF_INET; + src_in->sin_addr.s_addr = rt->rt_src; + + if (rt->idev->dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + /* If the device does ARP internally, return 'done' */ if (rt->idev->dev->flags & IFF_NOARP) { rdma_copy_addr(addr, rt->idev->dev, NULL); @@ -242,21 +241,14 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, } neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); - if (!neigh) { + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + neigh_event_send(rt->u.dst.neighbour, NULL); ret = -ENODATA; + if (neigh) + goto release; goto put; } - if (!(neigh->nud_state & NUD_VALID)) { - ret = -ENODATA; - goto release; - } - - if (!src_ip) { - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = rt->rt_src; - } - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); release: neigh_release(neigh); @@ -305,12 +297,12 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in, } #endif -static int addr_resolve_remote(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) +static int addr_resolve(struct sockaddr *src_in, + struct sockaddr *dst_in, + struct rdma_dev_addr *addr) { if (src_in->sa_family == AF_INET) { - return addr4_resolve_remote((struct sockaddr_in *) src_in, + return addr4_resolve((struct sockaddr_in *) src_in, (struct sockaddr_in *) dst_in, addr); } else return addr6_resolve_remote((struct sockaddr_in6 *) src_in, @@ -330,8 +322,7 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_remote(src_in, dst_in, - req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -363,32 +354,6 @@ static int addr_resolve_local(struct sockaddr *src_in, int ret; switch (dst_in->sa_family) { - case AF_INET: - { - __be32 src_ip = ((struct sockaddr_in *) src_in)->sin_addr.s_addr; - __be32 dst_ip = ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - dev = ip_dev_find(&init_net, dst_ip); - if (!dev) - return -EADDRNOTAVAIL; - - if (ipv4_is_zeronet(src_ip)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in *) src_in)->sin_addr.s_addr = dst_ip; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv4_is_loopback(src_ip)) { - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - dev_put(dev); - break; - } - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: { @@ -473,7 +438,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->status = addr_resolve_local(src_in, dst_in, addr); if (req->status == -EADDRNOTAVAIL) - req->status = addr_resolve_remote(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: -- cgit v1.1 From d14714df61681cfecf945a58436edf197327e87f Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 16:46:25 -0800 Subject: IB/addr: Fix IPv6 routing lookup Include link scope as part of address resolution. Combine local and remote address resolution into a single, simpler code path. Fix error checking in the IPv6 routing lookups. Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty [ Fix up cma_check_linklocal() for !IPV6 case. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 144 ++++++++++++----------------------------- drivers/infiniband/core/cma.c | 47 ++++++++++---- 2 files changed, 74 insertions(+), 117 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 38a7184..abbb069 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -176,34 +176,6 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static void addr_send_arp(struct sockaddr *dst_in) -{ - struct rtable *rt; - struct flowi fl; - - memset(&fl, 0, sizeof fl); - - switch (dst_in->sa_family) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct dst_entry *dst; - - fl.nl_u.ip6_u.daddr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - - dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return; - - neigh_event_send(dst->neighbour, NULL); - dst_release(dst); - break; - } -#endif - } -} - static int addr4_resolve(struct sockaddr_in *src_in, struct sockaddr_in *dst_in, struct rdma_dev_addr *addr) @@ -259,39 +231,63 @@ out: } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) { struct flowi fl; struct neighbour *neigh; struct dst_entry *dst; - int ret = -ENODATA; + int ret; memset(&fl, 0, sizeof fl); - fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; - fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr); + ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr); fl.oif = addr->bound_dev_if; dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return ret; + if ((ret = dst->error)) + goto put; + + if (ipv6_addr_any(&fl.fl6_src)) { + ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, + &fl.fl6_dst, 0, &fl.fl6_src); + if (ret) + goto put; + src_in->sin6_family = AF_INET6; + ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src); + } + + if (dst->dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + + /* If the device does ARP internally, return 'done' */ if (dst->dev->flags & IFF_NOARP) { ret = rdma_copy_addr(addr, dst->dev, NULL); - } else { - neigh = dst->neighbour; - if (neigh && (neigh->nud_state & NUD_VALID)) - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); + goto put; } + neigh = dst->neighbour; + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + neigh_event_send(dst->neighbour, NULL); + ret = -ENODATA; + goto put; + } + + ret = rdma_copy_addr(addr, dst->dev, neigh->ha); +put: dst_release(dst); return ret; } #else -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) { return -EADDRNOTAVAIL; } @@ -305,7 +301,7 @@ static int addr_resolve(struct sockaddr *src_in, return addr4_resolve((struct sockaddr_in *) src_in, (struct sockaddr_in *) dst_in, addr); } else - return addr6_resolve_remote((struct sockaddr_in6 *) src_in, + return addr6_resolve((struct sockaddr_in6 *) src_in, (struct sockaddr_in6 *) dst_in, addr); } @@ -346,60 +342,6 @@ static void process_req(struct work_struct *work) } } -static int addr_resolve_local(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) -{ - struct net_device *dev; - int ret; - - switch (dst_in->sa_family) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct in6_addr *a; - - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) - if (ipv6_chk_addr(&init_net, - &((struct sockaddr_in6 *) dst_in)->sin6_addr, - dev, 1)) - break; - - if (!dev) { - read_unlock(&dev_base_lock); - return -EADDRNOTAVAIL; - } - - a = &((struct sockaddr_in6 *) src_in)->sin6_addr; - - if (ipv6_addr_any(a)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in6 *) src_in)->sin6_addr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv6_addr_loopback(a)) { - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - read_unlock(&dev_base_lock); - break; - } -#endif - - default: - ret = -EADDRNOTAVAIL; - break; - } - - return ret; -} - int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, @@ -436,10 +378,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve_local(src_in, dst_in, addr); - if (req->status == -EADDRNOTAVAIL) - req->status = addr_resolve(src_in, dst_in, addr); - + req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; @@ -448,7 +387,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); - addr_send_arp(dst_in); break; default: ret = req->status; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 38867a4..fbdd731 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1472,15 +1472,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af) -{ - struct sockaddr_storage addr_in; - - memset(&addr_in, 0, sizeof addr_in); - addr_in.ss_family = af; - return rdma_bind_addr(id, (struct sockaddr *) &addr_in); -} - int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; @@ -1488,7 +1479,8 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == CMA_IDLE) { - ret = cma_bind_any(id, AF_INET); + ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; + ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); if (ret) return ret; } @@ -1885,10 +1877,14 @@ err: static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { - if (src_addr && src_addr->sa_family) - return rdma_bind_addr(id, src_addr); - else - return cma_bind_any(id, dst_addr->sa_family); + if (!src_addr || !src_addr->sa_family) { + src_addr = (struct sockaddr *) &id->route.addr.src_addr; + if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) { + ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = + ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + } + } + return rdma_bind_addr(id, src_addr); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2084,6 +2080,25 @@ static int cma_get_port(struct rdma_id_private *id_priv) return ret; } +static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, + struct sockaddr *addr) +{ +#if defined(CONFIG_IPv6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; + + if (addr->sa_family != AF_INET6) + return 0; + + sin6 = (struct sockaddr_in6 *) addr; + if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && + !sin6->sin6_scope_id) + return -EINVAL; + + dev_addr->bound_dev_if = sin6->sin6_scope_id; +#endif + return 0; +} + int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; @@ -2096,6 +2111,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; + ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + if (ret) + goto err1; + if (cma_loopback_addr(addr)) { ret = cma_bind_loopback(id_priv); } else if (!cma_zero_addr(addr)) { -- cgit v1.1 From 0cd4d0fd9b0a4e10c091fc6316d1bf92885dcd9c Mon Sep 17 00:00:00 2001 From: "David J. Wilder" Date: Wed, 9 Dec 2009 10:03:00 -0800 Subject: IPoIB: Clear ipoib_neigh.dgid in ipoib_neigh_alloc() IPoIB can miss a change in destination GID under some conditions. The problem is caused when ipoib_neigh->dgid contains a stale address. The fix is to set ipoib_neigh->dgid to zero in ipoib_neigh_alloc(). This can happen when a system using bonding on its IPoIB interfaces has switched its active interface from interface A to B and back to A. The system that fails over will not correctly processes the 2nd address change, as described below. When an address has changed neighbor->ha is updated with the new address. Each neighbor has an associated ipoib_neigh. ipoib_neigh->dgid also holds a copy of the remote node's hardware address. When an address changes neighbor->ha is updated by the network layer (arp code) with the new address. IPoIB detects this change in ipoib_start_xmit() by comparing neighbor->ha with ipoib_neigh->dgid. The bug is that ipoib_neigh->dgid may already contain the new address (A) thus the change from B to A is missed by ipoib. Here is the sequence of events: ipoib_neigh->dgid = A and neighbor->ha = A The address is switched to B (the first switch) neighbor->ha = B The change is seen in ipoib_start_xmit() -- neighbor->ha != ipoib_neigh->dgid so ipoib_neigh is released, and a new one is allocated. The allocator may return the same chunk of memory that was just released, therefore ipoib_neigh->dgid still contains A at this point. ipoib_neigh->dgid should be updated in neigh_add_path(), but if the following conditions are true dgid is not updated: 1) __path_find() returns a path 2) path->ah is NULL The remote system now switches from address B to A, neighbor->ha is updated to A. Now we have again : ipoib_neigh->dgid = A and neighbor->ha = A Since the addresses are the same ipoib won't process the change in address. Fix this by zeroing out the dgid field when allocating a new struct ipoib_neigh. Signed-off-by: David Wilder Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2bf5116..df3eb8c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -884,6 +884,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh->neighbour = neighbour; neigh->dev = dev; + memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); ipoib_cm_set(neigh, NULL); -- cgit v1.1 From 598cb6f327c99ceaf81c45c32504669b2028712b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 9 Dec 2009 10:05:28 -0800 Subject: IB/ipath: Use bitmap_weight() Use bitmap_weight() instead of finding all set bits in bitmap by hand. Signed-off-by: Akinobu Mita Cc: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 013d1380..d2787fe 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "ipath_kernel.h" #include "ipath_verbs.h" @@ -1697,7 +1698,7 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, unsigned len, int avail) { unsigned long flags; - unsigned end, cnt = 0, next; + unsigned end, cnt = 0; /* There are two bits per send buffer (busy and generation) */ start *= 2; @@ -1748,12 +1749,7 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, if (dd->ipath_pioupd_thresh) { end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); - next = find_first_bit(dd->ipath_pioavailkernel, end); - while (next < end) { - cnt++; - next = find_next_bit(dd->ipath_pioavailkernel, end, - next + 1); - } + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); -- cgit v1.1 From 9420269428b3dc80c98e52beac60a3976fbef7d2 Mon Sep 17 00:00:00 2001 From: Alexander Schmidt Date: Wed, 9 Dec 2009 10:11:04 -0800 Subject: IB/ehca: Rework destroy_eq() The ibmebus_free_irq() function, which might sleep, was called with interrupts disabled. To fix this, make sure that no interrupts are running by killing the interrupt tasklet. Also lock the shca_list_lock to protect against the poll_eqs_timer running concurrently. Signed-off-by: Alexander Schmidt Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_classes.h | 1 + drivers/infiniband/hw/ehca/ehca_eq.c | 9 ++++++--- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index c825142..0136abd 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -375,6 +375,7 @@ extern rwlock_t ehca_qp_idr_lock; extern rwlock_t ehca_cq_idr_lock; extern struct idr ehca_qp_idr; extern struct idr ehca_cq_idr; +extern spinlock_t shca_list_lock; extern int ehca_static_rate; extern int ehca_port_act_time; diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c index 523e733c..3b87589 100644 --- a/drivers/infiniband/hw/ehca/ehca_eq.c +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -169,12 +169,15 @@ int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq) unsigned long flags; u64 h_ret; - spin_lock_irqsave(&eq->spinlock, flags); ibmebus_free_irq(eq->ist, (void *)shca); - h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + spin_lock_irqsave(&shca_list_lock, flags); + eq->is_initialized = 0; + spin_unlock_irqrestore(&shca_list_lock, flags); - spin_unlock_irqrestore(&eq->spinlock, flags); + tasklet_kill(&eq->interrupt_task); + + h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "Can't free EQ resources."); diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index fb2d83c..129a6be 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -123,7 +123,7 @@ DEFINE_IDR(ehca_qp_idr); DEFINE_IDR(ehca_cq_idr); static LIST_HEAD(shca_list); /* list of all registered ehcas */ -static DEFINE_SPINLOCK(shca_list_lock); +DEFINE_SPINLOCK(shca_list_lock); static struct timer_list poll_eqs_timer; -- cgit v1.1 From e5dec39474fac3458ad6a649eab8cabfc977ae87 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 13:51:36 -0800 Subject: RDMA/nes: In nes_post_send() always set bad_wr on error On error, set bad_wr in nes_post_send(). Stop processing ib_wr queue when an error is detected. Signed-off-by: Frank Zago Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index a680c42..25b52d2 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3386,8 +3386,10 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, wqe_count = 0; total_payload_length = 0; - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } spin_lock_irqsave(&nesqp->lock, flags); @@ -3498,6 +3500,9 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, break; } + if (err) + break; + if (ib_wr->send_flags & IB_SEND_SIGNALED) { wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; } @@ -3522,6 +3527,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, spin_unlock_irqrestore(&nesqp->lock, flags); +out: if (err) *bad_wr = ib_wr; return err; -- cgit v1.1 From 4293fdc115e1e4f83dcb9ec6cbd3a54c563835f0 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 13:51:36 -0800 Subject: RDMA/nes: In nes_post_recv() always set bad_wr on error On error, set bad_wr in nes_post_recv(). Stop processing ib_wr queue when an error is detected. Signed-off-by: Frank Zago Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 25b52d2..0b17c01 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3554,8 +3554,10 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, u32 counter; u32 total_payload_length; - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } spin_lock_irqsave(&nesqp->lock, flags); @@ -3618,6 +3620,7 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, spin_unlock_irqrestore(&nesqp->lock, flags); +out: if (err) *bad_wr = ib_wr; return err; -- cgit v1.1 From 649fe4aeab8c9b90eb31c899791534add0c78e04 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 13:51:37 -0800 Subject: RDMA/nes: Add support for IB_WR_*INV Add support for IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV and IB_WR_LOCAL_INV. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 203 +++++++++++++++++++--------------- 1 file changed, 113 insertions(+), 90 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 0b17c01..499dd78 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3373,18 +3373,12 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, struct nes_device *nesdev = nesvnic->nesdev; struct nes_qp *nesqp = to_nesqp(ibqp); struct nes_hw_qp_wqe *wqe; - int err; + int err = 0; u32 qsize = nesqp->hwqp.sq_size; u32 head; - u32 wqe_misc; - u32 wqe_count; + u32 wqe_misc = 0; + u32 wqe_count = 0; u32 counter; - u32 total_payload_length; - - err = 0; - wqe_misc = 0; - wqe_count = 0; - total_payload_length = 0; if (nesqp->ibqp_state > IB_QPS_RTS) { err = -EINVAL; @@ -3415,91 +3409,117 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, u64temp = (u64)(ib_wr->wr_id); set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); - switch (ib_wr->opcode) { - case IB_WR_SEND: - if (ib_wr->send_flags & IB_SEND_SOLICITED) { - wqe_misc = NES_IWARP_SQ_OP_SENDSE; - } else { - wqe_misc = NES_IWARP_SQ_OP_SEND; - } - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - err = -EINVAL; - break; - } - if (ib_wr->send_flags & IB_SEND_FENCE) { - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - } - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (IB_WR_SEND == ib_wr->opcode) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + else + wqe_misc = NES_IWARP_SQ_OP_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; + else + wqe_misc = NES_IWARP_SQ_OP_SENDINV; - break; - case IB_WR_RDMA_WRITE: - wqe_misc = NES_IWARP_SQ_OP_RDMAW; - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", - ib_wr->num_sge, - nesdev->nesadapter->max_sge); - err = -EINVAL; - break; - } - if (ib_wr->send_flags & IB_SEND_FENCE) { - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - } + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - ib_wr->wr.rdma.rkey); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - ib_wr->wr.rdma.remote_addr); - - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; - break; - case IB_WR_RDMA_READ: - /* iWARP only supports 1 sge for RDMA reads */ - if (ib_wr->num_sge > 1) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", - ib_wr->num_sge); - err = -EINVAL; - break; - } - wqe_misc = NES_IWARP_SQ_OP_RDMAR; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - ib_wr->wr.rdma.remote_addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - ib_wr->wr.rdma.rkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, - ib_wr->sg_list->length); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - ib_wr->sg_list->addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, - ib_wr->sg_list->lkey); - break; - default: - /* error */ - err = -EINVAL; - break; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; } + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + if (ib_wr->opcode == IB_WR_RDMA_READ) { + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + } else { + wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + case IB_WR_LOCAL_INV: + wqe_misc = NES_IWARP_SQ_OP_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, + ib_wr->ex.invalidate_rkey); + break; + default: + /* error */ + err = -EINVAL; + break; + } + if (err) break; @@ -3729,6 +3749,9 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) nes_debug(NES_DBG_CQ, "Operation = Send.\n"); entry->opcode = IB_WC_SEND; break; + case NES_IWARP_SQ_OP_LOCINV: + entry->opcode = IB_WR_LOCAL_INV; + break; } nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); -- cgit v1.1 From df42245a3c246ec1eeeedbc3e5edbcc17f081c79 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 9 Dec 2009 14:30:44 -0800 Subject: IB/uverbs: Fix return of PTR_ERR() of wrong pointer in ib_uverbs_get_context() Signed-off-by: Roel Kluin Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..112d397 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -285,7 +285,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext = ibdev->alloc_ucontext(ibdev, &udata); if (IS_ERR(ucontext)) { - ret = PTR_ERR(file->ucontext); + ret = PTR_ERR(ucontext); goto err; } -- cgit v1.1 From e147de03610fab7781c09aaed078a932e549ed4a Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 15:07:25 -0800 Subject: IB/ehca: Fix error paths in post_send and post_recv Always set bad_wr when an immediate error is detected. Do not report success if an error occurred. Signed-off-by: Frank Zago Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_reqs.c | 67 ++++++++++++++++------------------ 1 file changed, 31 insertions(+), 36 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 8fd88cd..e3ec7fd 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -400,7 +400,6 @@ static inline void map_ib_wc_status(u32 cqe_status, static inline int post_one_send(struct ehca_qp *my_qp, struct ib_send_wr *cur_send_wr, - struct ib_send_wr **bad_send_wr, int hidden) { struct ehca_wqe *wqe_p; @@ -412,8 +411,6 @@ static inline int post_one_send(struct ehca_qp *my_qp, wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); if (unlikely(!wqe_p)) { /* too many posted work requests: queue overflow */ - if (bad_send_wr) - *bad_send_wr = cur_send_wr; ehca_err(my_qp->ib_qp.device, "Too many posted WQEs " "qp_num=%x", my_qp->ib_qp.qp_num); return -ENOMEM; @@ -433,8 +430,6 @@ static inline int post_one_send(struct ehca_qp *my_qp, */ if (unlikely(ret)) { my_qp->ipz_squeue.current_q_offset = start_offset; - if (bad_send_wr) - *bad_send_wr = cur_send_wr; ehca_err(my_qp->ib_qp.device, "Could not write WQE " "qp_num=%x", my_qp->ib_qp.qp_num); return -EINVAL; @@ -448,7 +443,6 @@ int ehca_post_send(struct ib_qp *qp, struct ib_send_wr **bad_send_wr) { struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); - struct ib_send_wr *cur_send_wr; int wqe_cnt = 0; int ret = 0; unsigned long flags; @@ -457,7 +451,8 @@ int ehca_post_send(struct ib_qp *qp, if (unlikely(my_qp->state < IB_QPS_RTS)) { ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", my_qp->state, qp->qp_num); - return -EINVAL; + ret = -EINVAL; + goto out; } /* LOCK the QUEUE */ @@ -476,24 +471,21 @@ int ehca_post_send(struct ib_qp *qp, struct ib_send_wr circ_wr; memset(&circ_wr, 0, sizeof(circ_wr)); circ_wr.opcode = IB_WR_RDMA_READ; - post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */ + post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */ wqe_cnt++; ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num); my_qp->message_count = my_qp->packet_count = 0; } /* loop processes list of send reqs */ - for (cur_send_wr = send_wr; cur_send_wr != NULL; - cur_send_wr = cur_send_wr->next) { - ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0); + while (send_wr) { + ret = post_one_send(my_qp, send_wr, 0); if (unlikely(ret)) { - /* if one or more WQEs were successful, don't fail */ - if (wqe_cnt) - ret = 0; goto post_send_exit0; } wqe_cnt++; - } /* eof for cur_send_wr */ + send_wr = send_wr->next; + } post_send_exit0: iosync(); /* serialize GAL register access */ @@ -503,6 +495,10 @@ post_send_exit0: my_qp, qp->qp_num, wqe_cnt, ret); my_qp->message_count += wqe_cnt; spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + +out: + if (ret) + *bad_send_wr = send_wr; return ret; } @@ -511,7 +507,6 @@ static int internal_post_recv(struct ehca_qp *my_qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { - struct ib_recv_wr *cur_recv_wr; struct ehca_wqe *wqe_p; int wqe_cnt = 0; int ret = 0; @@ -522,27 +517,23 @@ static int internal_post_recv(struct ehca_qp *my_qp, if (unlikely(!HAS_RQ(my_qp))) { ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d", my_qp, my_qp->real_qp_num, my_qp->ext_type); - return -ENODEV; + ret = -ENODEV; + goto out; } /* LOCK the QUEUE */ spin_lock_irqsave(&my_qp->spinlock_r, flags); - /* loop processes list of send reqs */ - for (cur_recv_wr = recv_wr; cur_recv_wr != NULL; - cur_recv_wr = cur_recv_wr->next) { + /* loop processes list of recv reqs */ + while (recv_wr) { u64 start_offset = my_qp->ipz_rqueue.current_q_offset; /* get pointer next to free WQE */ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue); if (unlikely(!wqe_p)) { /* too many posted work requests: queue overflow */ - if (bad_recv_wr) - *bad_recv_wr = cur_recv_wr; - if (wqe_cnt == 0) { - ret = -ENOMEM; - ehca_err(dev, "Too many posted WQEs " - "qp_num=%x", my_qp->real_qp_num); - } + ret = -ENOMEM; + ehca_err(dev, "Too many posted WQEs " + "qp_num=%x", my_qp->real_qp_num); goto post_recv_exit0; } /* @@ -552,7 +543,7 @@ static int internal_post_recv(struct ehca_qp *my_qp, rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size; /* write a RECV WQE into the QUEUE */ - ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, cur_recv_wr, + ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr, rq_map_idx); /* * if something failed, @@ -560,22 +551,20 @@ static int internal_post_recv(struct ehca_qp *my_qp, */ if (unlikely(ret)) { my_qp->ipz_rqueue.current_q_offset = start_offset; - *bad_recv_wr = cur_recv_wr; - if (wqe_cnt == 0) { - ret = -EINVAL; - ehca_err(dev, "Could not write WQE " - "qp_num=%x", my_qp->real_qp_num); - } + ret = -EINVAL; + ehca_err(dev, "Could not write WQE " + "qp_num=%x", my_qp->real_qp_num); goto post_recv_exit0; } qmap_entry = &my_qp->rq_map.map[rq_map_idx]; - qmap_entry->app_wr_id = get_app_wr_id(cur_recv_wr->wr_id); + qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id); qmap_entry->reported = 0; qmap_entry->cqe_req = 1; wqe_cnt++; - } /* eof for cur_recv_wr */ + recv_wr = recv_wr->next; + } /* eof for recv_wr */ post_recv_exit0: iosync(); /* serialize GAL register access */ @@ -584,6 +573,11 @@ post_recv_exit0: ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", my_qp, my_qp->real_qp_num, wqe_cnt, ret); spin_unlock_irqrestore(&my_qp->spinlock_r, flags); + +out: + if (ret) + *bad_recv_wr = recv_wr; + return ret; } @@ -597,6 +591,7 @@ int ehca_post_recv(struct ib_qp *qp, if (unlikely(my_qp->state == IB_QPS_RESET)) { ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", my_qp->state, qp->qp_num); + *bad_recv_wr = recv_wr; return -EINVAL; } -- cgit v1.1 From e293a26fe97c8598a96562c1c9376d9ae6cb96dd Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:54 -0800 Subject: RDMA/nes: Correct fast memory registration implementation Replace alloc_fmr, unmap_fmr, dealloc_fmr and map_phys_fmr with alloc_fast_reg_mr, alloc_fast_reg_page_list, free_fast_reg_page_list. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 5 +- drivers/infiniband/hw/nes/nes_hw.h | 27 +- drivers/infiniband/hw/nes/nes_user.h | 1 + drivers/infiniband/hw/nes/nes_verbs.c | 557 ++++++++++++++++------------------ 4 files changed, 299 insertions(+), 291 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 3512d6d..3d9bbff 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -424,8 +424,9 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->base_pd = 1; - nesadapter->device_cap_flags = - IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; + nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index f28a41b..8a4c738 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -546,11 +546,23 @@ enum nes_iwarp_sq_fmr_wqe_word_idx { NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14, }; +enum nes_iwarp_sq_fmr_opcodes { + NES_IWARP_SQ_FMR_WQE_ZERO_BASED = (1<<6), + NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K = (0<<7), + NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M = (1<<7), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ = (1<<16), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE = (1<<17), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ = (1<<18), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND = (1<<20), +}; + +#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK 0xFF; + enum nes_iwarp_sq_locinv_wqe_word_idx { NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6, }; - enum nes_iwarp_rq_wqe_word_idx { NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1, NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2, @@ -1153,6 +1165,19 @@ struct nes_pbl { /* TODO: need to add list for two level tables */ }; +#define NES_4K_PBL_CHUNK_SIZE 4096 + +struct nes_fast_mr_wqe_pbl { + u64 *kva; + dma_addr_t paddr; +}; + +struct nes_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + struct nes_fast_mr_wqe_pbl nes_wqe_pbl; + u64 pbl; +}; + struct nes_listener { struct work_struct work; struct workqueue_struct *wq; diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index cc90c14..ce62f3c 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -86,6 +86,7 @@ enum iwnes_memreg_type { IWNES_MEMREG_TYPE_CQ = 0x0002, IWNES_MEMREG_TYPE_MW = 0x0003, IWNES_MEMREG_TYPE_FMR = 0x0004, + IWNES_MEMREG_TYPE_FMEM = 0x0005, }; struct nes_mem_reg_req { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 499dd78..0a2b18b 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -275,342 +275,236 @@ static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, } -/** - * nes_alloc_fmr +/* + * nes_alloc_fast_mr */ -static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd, - int ibmr_access_flags, - struct ib_fmr_attr *ibfmr_attr) +static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u32 page_count) { - unsigned long flags; - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_fmr *nesfmr; - struct nes_cqp_request *cqp_request; struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; int ret; - u32 stag; - u32 stag_index = 0; - u32 next_stag_index = 0; - u32 driver_key = 0; + struct nes_adapter *nesadapter = nesdev->nesadapter; u32 opcode = 0; - u8 stag_key = 0; - int i=0; - struct nes_vpbl vpbl; - - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - - driver_key = 0; - - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, - nesadapter->max_mr, &stag_index, &next_stag_index); - if (ret) { - goto failed_resource_alloc; - } - - nesfmr = kzalloc(sizeof(*nesfmr), GFP_KERNEL); - if (!nesfmr) { - ret = -ENOMEM; - goto failed_fmr_alloc; - } - - nesfmr->nesmr.mode = IWNES_MEMREG_TYPE_FMR; - if (ibfmr_attr->max_pages == 1) { - /* use zero length PBL */ - nesfmr->nesmr.pbl_4k = 0; - nesfmr->nesmr.pbls_used = 0; - } else if (ibfmr_attr->max_pages <= 32) { - /* use PBL 256 */ - nesfmr->nesmr.pbl_4k = 0; - nesfmr->nesmr.pbls_used = 1; - } else if (ibfmr_attr->max_pages <= 512) { - /* use 4K PBLs */ - nesfmr->nesmr.pbl_4k = 1; - nesfmr->nesmr.pbls_used = 1; - } else { - /* use two level 4K PBLs */ - /* add support for two level 256B PBLs */ - nesfmr->nesmr.pbl_4k = 1; - nesfmr->nesmr.pbls_used = 1 + (ibfmr_attr->max_pages >> 9) + - ((ibfmr_attr->max_pages & 511) ? 1 : 0); - } - /* Register the region with the adapter */ - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - - /* track PBL resources */ - if (nesfmr->nesmr.pbls_used != 0) { - if (nesfmr->nesmr.pbl_4k) { - if (nesfmr->nesmr.pbls_used > nesadapter->free_4kpbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_avail; - } else { - nesadapter->free_4kpbl -= nesfmr->nesmr.pbls_used; - } - } else { - if (nesfmr->nesmr.pbls_used > nesadapter->free_256pbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_avail; - } else { - nesadapter->free_256pbl -= nesfmr->nesmr.pbls_used; - } - } - } - - /* one level pbl */ - if (nesfmr->nesmr.pbls_used == 0) { - nesfmr->root_vpbl.pbl_vbase = NULL; - nes_debug(NES_DBG_MR, "zero level pbl \n"); - } else if (nesfmr->nesmr.pbls_used == 1) { - /* can change it to kmalloc & dma_map_single */ - nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &nesfmr->root_vpbl.pbl_pbase); - if (!nesfmr->root_vpbl.pbl_vbase) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_alloc; - } - nesfmr->leaf_pbl_cnt = 0; - nes_debug(NES_DBG_MR, "one level pbl, root_vpbl.pbl_vbase=%p \n", - nesfmr->root_vpbl.pbl_vbase); - } - /* two level pbl */ - else { - nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, - &nesfmr->root_vpbl.pbl_pbase); - if (!nesfmr->root_vpbl.pbl_vbase) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_alloc; - } - - nesfmr->leaf_pbl_cnt = nesfmr->nesmr.pbls_used-1; - nesfmr->root_vpbl.leaf_vpbl = kzalloc(sizeof(*nesfmr->root_vpbl.leaf_vpbl)*1024, GFP_ATOMIC); - if (!nesfmr->root_vpbl.leaf_vpbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_leaf_vpbl_alloc; - } - - nes_debug(NES_DBG_MR, "two level pbl, root_vpbl.pbl_vbase=%p" - " leaf_pbl_cnt=%d root_vpbl.leaf_vpbl=%p\n", - nesfmr->root_vpbl.pbl_vbase, nesfmr->leaf_pbl_cnt, nesfmr->root_vpbl.leaf_vpbl); - - for (i=0; ileaf_pbl_cnt; i++) - nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase = NULL; - - for (i=0; ileaf_pbl_cnt; i++) { - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - - if (!vpbl.pbl_vbase) { - ret = -ENOMEM; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - goto failed_leaf_vpbl_pages_alloc; - } - - nesfmr->root_vpbl.pbl_vbase[i].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); - nesfmr->root_vpbl.pbl_vbase[i].pa_high = cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); - nesfmr->root_vpbl.leaf_vpbl[i] = vpbl; - - nes_debug(NES_DBG_MR, "pbase_low=0x%x, pbase_high=0x%x, vpbl=%p\n", - nesfmr->root_vpbl.pbl_vbase[i].pa_low, - nesfmr->root_vpbl.pbl_vbase[i].pa_high, - &nesfmr->root_vpbl.leaf_vpbl[i]); - } - } - nesfmr->ib_qp = NULL; - nesfmr->access_rights =0; + u16 major_code; + u64 region_length = page_count * PAGE_SIZE; - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); cqp_request = nes_get_cqp_request(nesdev); if (cqp_request == NULL) { nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - ret = -ENOMEM; - goto failed_leaf_vpbl_pages_alloc; + return -ENOMEM; } + nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " + "region_length = %llu\n", + page_count, region_length); cqp_request->waiting = 1; cqp_wqe = &cqp_request->cqp_wqe; - nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", - stag, stag_index); - - opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; - - if (nesfmr->nesmr.pbl_4k == 1) - opcode |= NES_CQP_STAG_PBL_BLK_SIZE; - - if (ibmr_access_flags & IB_ACCESS_REMOTE_WRITE) { - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | - NES_CQP_STAG_RIGHTS_LOCAL_WRITE | NES_CQP_STAG_REM_ACC_EN; - nesfmr->access_rights |= - NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_RIGHTS_LOCAL_WRITE | - NES_CQP_STAG_REM_ACC_EN; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesadapter->free_4kpbl > 0) { + nesadapter->free_4kpbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } else { + /* No 4kpbl's available: */ + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; } - if (ibmr_access_flags & IB_ACCESS_REMOTE_READ) { - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | - NES_CQP_STAG_RIGHTS_LOCAL_READ | NES_CQP_STAG_REM_ACC_EN; - nesfmr->access_rights |= - NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_RIGHTS_LOCAL_READ | - NES_CQP_STAG_REM_ACC_EN; - } + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | + NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN; + /* + * The current OFED API does not support the zero based TO option. + * If added then need to changed the NES_CQP_STAG_VA* option. Also, + * the API does not support that ability to have the MR set for local + * access only when created and not allow the SQ op to override. Given + * this the remote enable must be set here. + */ nes_fill_init_cqp_wqe(cqp_wqe, nesdev); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = - cpu_to_le32((nesfmr->nesmr.pbls_used>1) ? - (nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + barrier(); atomic_set(&cqp_request->refcount, 2); nes_post_cqp_request(nesdev, cqp_request); /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - stag, ret, cqp_request->major_code, cqp_request->minor_code); - - if ((!ret) || (cqp_request->major_code)) { - nes_put_cqp_request(nesdev, cqp_request); - ret = (!ret) ? -ETIME : -EIO; - goto failed_leaf_vpbl_pages_alloc; - } + ret = wait_event_timeout(cqp_request->waitq, + (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " + "wait_event_timeout ret = %u, CQP Major:Minor codes = " + "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, + cqp_request->minor_code); + major_code = cqp_request->major_code; nes_put_cqp_request(nesdev, cqp_request); - nesfmr->nesmr.ibfmr.lkey = stag; - nesfmr->nesmr.ibfmr.rkey = stag; - nesfmr->attr = *ibfmr_attr; - - return &nesfmr->nesmr.ibfmr; - - failed_leaf_vpbl_pages_alloc: - /* unroll all allocated pages */ - for (i=0; ileaf_pbl_cnt; i++) { - if (nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase) { - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, - nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); - } - } - if (nesfmr->root_vpbl.leaf_vpbl) - kfree(nesfmr->root_vpbl.leaf_vpbl); - - failed_leaf_vpbl_alloc: - if (nesfmr->leaf_pbl_cnt == 0) { - if (nesfmr->root_vpbl.pbl_vbase) - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - } else - pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - failed_vpbl_alloc: - if (nesfmr->nesmr.pbls_used != 0) { + if (!ret || major_code) { spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesfmr->nesmr.pbl_4k) - nesadapter->free_4kpbl += nesfmr->nesmr.pbls_used; - else - nesadapter->free_256pbl += nesfmr->nesmr.pbls_used; + nesadapter->free_4kpbl++; spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); } -failed_vpbl_avail: - kfree(nesfmr); - - failed_fmr_alloc: - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - - failed_resource_alloc: - return ERR_PTR(ret); + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + return 0; } - -/** - * nes_dealloc_fmr +/* + * nes_alloc_fast_reg_mr */ -static int nes_dealloc_fmr(struct ib_fmr *ibfmr) +struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) { - unsigned long flags; - struct nes_mr *nesmr = to_nesmr_from_ibfmr(ibfmr); - struct nes_fmr *nesfmr = to_nesfmr(nesmr); - struct nes_vnic *nesvnic = to_nesvnic(ibfmr->device); + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - int i = 0; - int rc; - /* free the resources */ - if (nesfmr->leaf_pbl_cnt == 0) { - /* single PBL case */ - if (nesfmr->root_vpbl.pbl_vbase) - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - } else { - for (i = 0; i < nesfmr->leaf_pbl_cnt; i++) { - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, - nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); - } - kfree(nesfmr->root_vpbl.leaf_vpbl); - pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - } - nesmr->ibmw.device = ibfmr->device; - nesmr->ibmw.pd = ibfmr->pd; - nesmr->ibmw.rkey = ibfmr->rkey; - nesmr->ibmw.uobject = NULL; + u32 next_stag_index; + u8 stag_key = 0; + u32 driver_key = 0; + int err = 0; + u32 stag_index = 0; + struct nes_mr *nesmr; + u32 stag; + int ret; + struct ib_mr *ibmr; +/* + * Note: Set to always use a fixed length single page entry PBL. This is to allow + * for the fast_reg_mr operation to always know the size of the PBL. + */ + if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); - rc = nes_dealloc_mw(&nesmr->ibmw); + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; - if ((rc == 0) && (nesfmr->nesmr.pbls_used != 0)) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesfmr->nesmr.pbl_4k) { - nesadapter->free_4kpbl += nesfmr->nesmr.pbls_used; - WARN_ON(nesadapter->free_4kpbl > nesadapter->max_4kpbl); - } else { - nesadapter->free_256pbl += nesfmr->nesmr.pbls_used; - WARN_ON(nesadapter->free_256pbl > nesadapter->max_256pbl); - } - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, + &next_stag_index); + if (err) + return ERR_PTR(err); + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); } - return rc; -} + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", + stag, stag_index); -/** - * nes_map_phys_fmr + ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_FMEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + } + return ibmr; +} + +/* + * nes_alloc_fast_reg_page_list */ -static int nes_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) +static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list( + struct ib_device *ibdev, + int page_list_len) { - return 0; -} + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_fast_reg_page_list *pifrpl; + struct nes_ib_fast_reg_page_list *pnesfrpl; + if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + /* + * Allocate the ib_fast_reg_page_list structure, the + * nes_fast_bpl structure, and the PLB table. + */ + pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) + + page_list_len * sizeof(u64), GFP_KERNEL); + + if (!pnesfrpl) + return ERR_PTR(-ENOMEM); -/** - * nes_unmap_frm + pifrpl = &pnesfrpl->ibfrpl; + pifrpl->page_list = &pnesfrpl->pbl; + pifrpl->max_page_list_len = page_list_len; + /* + * Allocate the WQE PBL + */ + pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev, + page_list_len * sizeof(u64), + &pnesfrpl->nes_wqe_pbl.paddr); + + if (!pnesfrpl->nes_wqe_pbl.kva) { + kfree(pnesfrpl); + return ERR_PTR(-ENOMEM); + } + nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, " + "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, " + "pbl.paddr= %p\n", pnesfrpl, &pnesfrpl->ibfrpl, + pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva, + (void *)pnesfrpl->nes_wqe_pbl.paddr); + + return pifrpl; +} + +/* + * nes_free_fast_reg_page_list */ -static int nes_unmap_fmr(struct list_head *ibfmr_list) +static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl) { - return 0; + struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl); + /* + * Free the WQE PBL. + */ + pci_free_consistent(nesdev->pcidev, + pifrpl->max_page_list_len * sizeof(u64), + pnesfrpl->nes_wqe_pbl.kva, + pnesfrpl->nes_wqe_pbl.paddr); + /* + * Free the PBL structure + */ + kfree(pnesfrpl); } - - /** * nes_query_device */ @@ -3514,6 +3408,91 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, ib_wr->ex.invalidate_rkey); break; + case IB_WR_FAST_REG_MR: + { + int i; + int flags = ib_wr->wr.fast_reg.access_flags; + struct nes_ib_fast_reg_page_list *pnesfrpl = + container_of(ib_wr->wr.fast_reg.page_list, + struct nes_ib_fast_reg_page_list, + ibfrpl); + u64 *src_page_list = pnesfrpl->ibfrpl.page_list; + u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva; + + if (ib_wr->wr.fast_reg.page_list_len > + (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { + nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n"); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_FAST_REG; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, + ib_wr->wr.fast_reg.iova_start); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, + ib_wr->wr.fast_reg.length); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, + ib_wr->wr.fast_reg.rkey); + /* Set page size: */ + if (ib_wr->wr.fast_reg.page_shift == 12) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; + } else if (ib_wr->wr.fast_reg.page_shift == 21) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; + } else { + nes_debug(NES_DBG_IW_TX, "Invalid page shift," + " ib_wr=%u, max=1\n", ib_wr->num_sge); + err = -EINVAL; + break; + } + /* Set access_flags */ + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; + if (flags & IB_ACCESS_LOCAL_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; + + if (flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; + + if (flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; + + if (flags & IB_ACCESS_MW_BIND) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; + + /* Fill in PBL info: */ + if (ib_wr->wr.fast_reg.page_list_len > + pnesfrpl->ibfrpl.max_page_list_len) { + nes_debug(NES_DBG_IW_TX, "Invalid page list length," + " ib_wr=%p, value=%u, max=%u\n", + ib_wr, ib_wr->wr.fast_reg.page_list_len, + pnesfrpl->ibfrpl.max_page_list_len); + err = -EINVAL; + break; + } + + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, + pnesfrpl->nes_wqe_pbl.paddr); + + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, + ib_wr->wr.fast_reg.page_list_len * 8); + + for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++) + dst_page_list[i] = cpu_to_le64(src_page_list[i]); + + nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %p, " + "length: %d, rkey: %0x, pgl_paddr: %p, " + "page_list_len: %u, wqe_misc: %x\n", + (void *)ib_wr->wr.fast_reg.iova_start, + ib_wr->wr.fast_reg.length, + ib_wr->wr.fast_reg.rkey, + (void *)pnesfrpl->nes_wqe_pbl.paddr, + ib_wr->wr.fast_reg.page_list_len, + wqe_misc); + break; + } default: /* error */ err = -EINVAL; @@ -3752,6 +3731,9 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) case NES_IWARP_SQ_OP_LOCINV: entry->opcode = IB_WR_LOCAL_INV; break; + case NES_IWARP_SQ_OP_FAST_REG: + entry->opcode = IB_WC_FAST_REG_MR; + break; } nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); @@ -3922,10 +3904,9 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; nesibdev->ibdev.bind_mw = nes_bind_mw; - nesibdev->ibdev.alloc_fmr = nes_alloc_fmr; - nesibdev->ibdev.unmap_fmr = nes_unmap_fmr; - nesibdev->ibdev.dealloc_fmr = nes_dealloc_fmr; - nesibdev->ibdev.map_phys_fmr = nes_map_phys_fmr; + nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr; + nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list; + nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list; nesibdev->ibdev.attach_mcast = nes_multicast_attach; nesibdev->ibdev.detach_mcast = nes_multicast_detach; -- cgit v1.1 From a276510328d0721c252b37044c51e2fb4efe0364 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: RDMA/nes: Add additional SFP+ PHY uC status check and PHY reset Add additional PHY uC status check in case PHY firmware is not running properly with heartbeat. Add a hard PHY reset if uC status is 0x0 after initial reset. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 3d9bbff..b59ca56 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1356,6 +1356,8 @@ int nes_init_phy(struct nes_device *nesdev) } if ((phy_type == NES_PHY_TYPE_ARGUS) || (phy_type == NES_PHY_TYPE_SFP_D)) { + u32 first_time = 1; + /* Check firmware heartbeat */ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); @@ -1363,8 +1365,13 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if (temp_phy_data != temp_phy_data2) - return 0; + if (temp_phy_data != temp_phy_data2) { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((temp_phy_data & 0xff) > 0x20) + return 0; + printk(PFX "Reinitializing PHY\n"); + } /* no heartbeat, configure the PHY */ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); @@ -1400,7 +1407,7 @@ int nes_init_phy(struct nes_device *nesdev) temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); do { if (counter++ > 150) { - nes_debug(NES_DBG_PHY, "No PHY heartbeat\n"); + printk(PFX "No PHY heartbeat\n"); break; } mdelay(1); @@ -1414,11 +1421,20 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); if (counter++ > 300) { - nes_debug(NES_DBG_PHY, "PHY did not track\n"); - break; + if (((temp_phy_data & 0xff) == 0x0) && first_time) { + first_time = 0; + counter = 0; + /* reset AMCC PHY and try again */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); + continue; + } else { + printk(PFX "PHY did not track\n"); + break; + } } mdelay(10); - } while (((temp_phy_data & 0xff) != 0x50) && ((temp_phy_data & 0xff) != 0x70)); + } while ((temp_phy_data & 0xff) < 0x30); /* setup signal integrity */ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); -- cgit v1.1 From d14152da13dc29aa70cddd8ca214a13e3597eb7f Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: RDMA/nes: Implement IB_SIGNAL_ALL_WR as an iWARP extension Add IB_SINGAL_ALL_WR support as an iWARP extension. If set, make sure all WR for the QP are signalled. Consolidate flags used in nesqp structure. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 6 ++++-- drivers/infiniband/hw/nes/nes_verbs.h | 15 +++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 0a2b18b..8ea7561 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1407,6 +1407,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, return ERR_PTR(-EINVAL); } + nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + /* update the QP table */ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", @@ -3502,9 +3504,9 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, if (err) break; - if (ib_wr->send_flags & IB_SEND_SIGNALED) { + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - } + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); ib_wr = ib_wr->next; diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 89822d7..ac8b86b 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -167,17 +167,20 @@ struct nes_qp { enum ib_event_type terminate_eventtype; wait_queue_head_t kick_waitq; u16 in_disconnect; + u16 active_conn:1; + u16 skip_lsmm:1; + u16 user_mode:1; + u16 hte_added:1; + u16 flush_issued:1; + u16 destroyed:1; + u16 sq_kmapped:1; + u16 sig_all:1; + u16 rsvd:8; u16 private_data_len; u16 term_sq_flush_code; u16 term_rq_flush_code; - u8 active_conn; - u8 skip_lsmm; - u8 user_mode; - u8 hte_added; u8 hw_iwarp_state; - u8 flush_issued; u8 hw_tcp_state; u8 term_flags; - u8 destroyed; }; #endif /* NES_VERBS_H */ -- cgit v1.1 From 75742c630ed552ad963948c9f3e596e96eed7a9f Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: RDMA/nes: Clean up struct nes_qp Remove unused and not really used variables. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 6 ------ drivers/infiniband/hw/nes/nes_verbs.h | 9 +-------- 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 8ea7561..ea4e22e 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1398,8 +1398,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); spin_lock_init(&nesqp->lock); - init_waitqueue_head(&nesqp->state_waitq); - init_waitqueue_head(&nesqp->kick_waitq); nes_add_ref(&nesqp->ibqp); break; default: @@ -3005,7 +3003,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, " already done based on hw state.\n", nesqp->hwqp.qp_id); issue_modify_qp = 0; - nesqp->in_disconnect = 0; } switch (nesqp->hw_iwarp_state) { case NES_AEQE_IWARP_STATE_CLOSING: @@ -3018,7 +3015,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, break; default: next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; - nesqp->in_disconnect = 1; nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; break; } @@ -3035,7 +3031,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; issue_modify_qp = 1; - nesqp->in_disconnect = 1; break; case IB_QPS_ERR: case IB_QPS_RESET: @@ -3058,7 +3053,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { next_iwarp_state |= NES_CQP_QP_RESET; - nesqp->in_disconnect = 1; } else { nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", nesqp->hwqp.qp_id, nesqp->hw_tcp_state); diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index ac8b86b..795aa4f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -135,19 +135,15 @@ struct nes_qp { struct ib_qp ibqp; void *allocated_buffer; struct iw_cm_id *cm_id; - struct workqueue_struct *wq; struct nes_cq *nesscq; struct nes_cq *nesrcq; struct nes_pd *nespd; void *cm_node; /* handle of the node this QP is associated with */ struct ietf_mpa_frame *ietf_frame; dma_addr_t ietf_frame_pbase; - wait_queue_head_t state_waitq; struct ib_mr *lsmm_mr; - unsigned long socket; struct nes_hw_qp hwqp; struct work_struct work; - struct work_struct ae_work; enum ib_qp_state ibqp_state; u32 iwarp_state; u32 hte_index; @@ -165,17 +161,14 @@ struct nes_qp { struct page *page; struct timer_list terminate_timer; enum ib_event_type terminate_eventtype; - wait_queue_head_t kick_waitq; - u16 in_disconnect; u16 active_conn:1; u16 skip_lsmm:1; u16 user_mode:1; u16 hte_added:1; u16 flush_issued:1; u16 destroyed:1; - u16 sq_kmapped:1; u16 sig_all:1; - u16 rsvd:8; + u16 rsvd:9; u16 private_data_len; u16 term_sq_flush_code; u16 term_rq_flush_code; -- cgit v1.1 From 5924aea6e26712cd372aa23ed432d4cefbb050d2 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: RDMA/nes: Add max_cqe check to nes_create_cq() Add a check to nes_create_cq() to return -EINVAL if creating a CQ with depth > max_cqe (32766). Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index b59ca56..6f625a9 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -482,7 +482,7 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->max_irrq_wr = (u32temp >> 16) & 3; nesadapter->max_sge = 4; - nesadapter->max_cqe = 32767; + nesadapter->max_cqe = 32766; if (nes_read_eeprom_values(nesdev, nesadapter)) { printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index ea4e22e..155286b 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -527,7 +527,7 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; props->max_sge = nesdev->nesadapter->max_sge; props->max_cq = nesibdev->max_cq; - props->max_cqe = nesdev->nesadapter->max_cqe - 1; + props->max_cqe = nesdev->nesadapter->max_cqe; props->max_mr = nesibdev->max_mr; props->max_mw = nesibdev->max_mr; props->max_pd = nesibdev->max_pd; @@ -1543,6 +1543,9 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, unsigned long flags; int ret; + if (entries > nesadapter->max_cqe) + return ERR_PTR(-EINVAL); + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, nesadapter->max_cq, &cq_num, &nesadapter->next_cq); if (err) { -- cgit v1.1 From fa6c87d5104512bf73cf62162cec9ef6eba707c7 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: RDMA/nes: Update copyright and branding string Update copyright from Intel-NE, Inc. to Intel Corporation. Use proper branding string in Kconfig and simplify description. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/Kconfig | 9 ++++----- drivers/infiniband/hw/nes/nes.c | 2 +- drivers/infiniband/hw/nes/nes.h | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/hw/nes/nes_cm.h | 2 +- drivers/infiniband/hw/nes/nes_context.h | 2 +- drivers/infiniband/hw/nes/nes_hw.c | 2 +- drivers/infiniband/hw/nes/nes_hw.h | 2 +- drivers/infiniband/hw/nes/nes_nic.c | 2 +- drivers/infiniband/hw/nes/nes_user.h | 2 +- drivers/infiniband/hw/nes/nes_utils.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.h | 2 +- 13 files changed, 16 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig index d449eb6..846dc97 100644 --- a/drivers/infiniband/hw/nes/Kconfig +++ b/drivers/infiniband/hw/nes/Kconfig @@ -4,14 +4,13 @@ config INFINIBAND_NES select LIBCRC32C select INET_LRO ---help--- - This is a low-level driver for NetEffect RDMA enabled - Network Interface Cards (RNIC). + This is the RDMA Network Interface Card (RNIC) driver for + NetEffect Ethernet Cluster Server Adapters. config INFINIBAND_NES_DEBUG bool "Verbose debugging output" depends on INFINIBAND_NES default n ---help--- - This option causes the NetEffect RNIC driver to produce debug - messages. Select this if you are developing the driver - or trying to diagnose a problem. + This option enables debug messages from the NetEffect RNIC + driver. Select this if you are diagnosing a problem. diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index cbde0cf..88d3114 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bcc6abc..9884056 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 73473db..dbe5455 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 90e8e4d..5e4808c 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h index 0fb8d81..b4393a1 100644 --- a/drivers/infiniband/hw/nes/nes_context.h +++ b/drivers/infiniband/hw/nes/nes_context.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 6f625a9..9fc0273 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index 8a4c738..084be0e 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. +* Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index e593af3..5a7b554 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index ce62f3c..71e133a 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 9687c39..729d525 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 155286b..0868652 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 795aa4f..cc7a604 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two -- cgit v1.1 From d85ddd835b33a9a0f2276ce068318da3fd1ad76a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 9 Dec 2009 15:21:57 -0800 Subject: RDMA/nes: Pass correct size to ioremap_nocache() The size argument to ioremap_nocache should be the size of desired information, not the pointer to it. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression *x; @@ x = <+... *sizeof(x) ...+>// Signed-off-by: Julia Lawall Acked-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 88d3114..b9d09ba 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -521,7 +521,8 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i spin_lock_init(&nesdev->indexed_regs_lock); /* Remap the PCI registers in adapter BAR0 to kernel VA space */ - mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), sizeof(mmio_regs)); + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), + pci_resource_len(pcidev, BAR_0)); if (mmio_regs == NULL) { printk(KERN_ERR PFX "Unable to remap BAR0\n"); ret = -EIO; -- cgit v1.1 From 9b84dbe7f479a5a5fa53d689c2adf214ce7760e5 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:36 -0800 Subject: RDMA/nes: Fix MAX_CM_BUFFER define Change MAX_CM_BUFFER for MPA frames to be conformant to RFC 5044: we need 512 + 20 instead of 512. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 5e4808c..911846a 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -47,6 +47,8 @@ #define IEFT_MPA_KEY_REP "MPA ID Rep Frame" #define IETF_MPA_KEY_SIZE 16 #define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ @@ -169,7 +171,7 @@ struct nes_timer_entry { #define NES_CM_DEF_SEQ2 0x18ed5740 #define NES_CM_DEF_LOCAL_ID2 0xb807 -#define MAX_CM_BUFFER 512 +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_MAX_PRIV_DATA_LEN) typedef u32 nes_addr_t; -- cgit v1.1 From 8ac7f6e1af5309d4fdf6805fb64ef48c1c820d85 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:46 -0800 Subject: RDMA/nes: Fix query of ORD values The ORD size needs updating as we are supporting more inbound READ resources per connection. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 0868652..67a87cb 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -534,16 +534,16 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop props->max_sge_rd = 1; switch (nesdev->nesadapter->max_irrq_wr) { case 0: - props->max_qp_rd_atom = 1; + props->max_qp_rd_atom = 2; break; case 1: - props->max_qp_rd_atom = 4; + props->max_qp_rd_atom = 8; break; case 2: - props->max_qp_rd_atom = 16; + props->max_qp_rd_atom = 32; break; case 3: - props->max_qp_rd_atom = 32; + props->max_qp_rd_atom = 64; break; default: props->max_qp_rd_atom = 0; -- cgit v1.1 From 1cf078c9951b531bc222a5195306a3a927c24fc9 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:54 -0800 Subject: RDMA/nes: MPA request/response error checking During Xansation testing, we saw that error handling of MPA frame msg/response is not handled properly. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index dbe5455..ae09463 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -251,6 +251,33 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, mpa_frame = (struct ietf_mpa_frame *)buffer; cm_node->mpa_frame_size = ntohs(mpa_frame->priv_data_len); + /* make sure mpa private data len is less than 512 bytes */ + if (cm_node->mpa_frame_size > IETF_MAX_PRIV_DATA_LEN) { + nes_debug(NES_DBG_CM, "The received Length of Private" + " Data field exceeds 512 octets\n"); + return -EINVAL; + } + /* + * make sure MPA receiver interoperate with the + * received MPA version and MPA key information + * + */ + if (mpa_frame->rev != mpa_version) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) { nes_debug(NES_DBG_CM, "The received ietf buffer was not right" @@ -1974,7 +2001,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (!cm_node) return NULL; mpa_frame = &cm_node->mpa_frame; - strcpy(mpa_frame->key, IEFT_MPA_KEY_REQ); + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); mpa_frame->flags = IETF_MPA_FLAGS_CRC; mpa_frame->rev = IETF_MPA_VERSION; mpa_frame->priv_data_len = htons(private_data_len); @@ -2929,7 +2956,7 @@ int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) if (cm_node->mpa_frame_size > MAX_CM_BUFFER) return -EINVAL; - strcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP); + memcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); if (loopback) { memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); loopback->mpa_frame.priv_data_len = pdata_len; -- cgit v1.1 From 69524e1aff75e4ed8efcb7d699c97d55c317a950 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:03 -0800 Subject: RDMA/nes: Resource not freed for REJECTed connections During testing of REJECT connection error handling, we saw that the cm_id resources are not released. When the retransmit timer expires, we need to send a reset message to remote node before issuing the ABORTED event. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index ae09463..08fcd25 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -513,6 +513,8 @@ static void nes_retrans_expired(struct nes_cm_node *cm_node) send_reset(cm_node, NULL); break; default: + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); create_event(cm_node, NES_CM_EVENT_ABORTED); } } -- cgit v1.1 From c5a7d4897156667a58fd8479f6227143573fe82d Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:08 -0800 Subject: RDMA/nes: Fix crash in nes_accept() While running IMP_EXT's window test, we saw a crash in nes_accept(). Here is the sequence of what happened: (1) In MVAPICH2, connect request is received for port #0. FIX: Add a nes_connect() check to make sure local or remote tcp port is not 0. (2) Remote node's (passive) TCP stack sends a reset when it gets a connect request because of port = 0. Active side set the connect error to IW_CM_EVENT_STATUS_REJECTED when it received the RST from remote node. FIX: The corect error code is -ECONNRESET. (3) Wrong error code of IW_CM_EVENT_STATUS_REJECTED causes the core to destroy its listener ports. Here there are connections that may have sent an MPA request up and waiting for accept or reject. But the listener and its cm_nodes have been freed already causing the crash noticed. FIX: The cm_node is freed only if its state is not NES_CM_STATE_MPAREQ_RCVD. If cm_node's state is NES_CM_STATE_MPAREQ_RCVD then its new state is set to NES_CM_STATE_LISTENER_DESTROYED and it is not freed. When nes_accept() or nes_reject() is received, its state is checked for NES_CM_STATE_LISTENER_DESTROYED and in this case the cm_node is freed and error is returned. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 77 ++++++++++++++++++++++++-------------- drivers/infiniband/hw/nes/nes_cm.h | 1 + 2 files changed, 50 insertions(+), 28 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 08fcd25..ec04786 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -978,6 +978,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, reset_entry); { struct nes_cm_node *loopback = cm_node->loopbackpartner; + enum nes_cm_node_state old_state; if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { rem_ref_cm_node(cm_node->cm_core, cm_node); } else { @@ -989,11 +990,12 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, NES_CM_STATE_CLOSED; WARN_ON(1); } else { - cm_node->state = - NES_CM_STATE_CLOSED; - rem_ref_cm_node( - cm_node->cm_core, - cm_node); + old_state = cm_node->state; + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + if (old_state != NES_CM_STATE_MPAREQ_RCVD) + rem_ref_cm_node( + cm_node->cm_core, + cm_node); } } else { struct nes_cm_event event; @@ -1009,6 +1011,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, loopback->loc_port; event.cm_info.cm_id = loopback->cm_id; cm_event_connect_error(&event); + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; loopback->state = NES_CM_STATE_CLOSED; event.cm_node = cm_node; @@ -2131,30 +2134,39 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, cm_node->state = NES_CM_STATE_CLOSED; rem_ref_cm_node(cm_core, cm_node); } else { - ret = send_mpa_reject(cm_node); - if (ret) { - cm_node->state = NES_CM_STATE_CLOSED; - err = send_reset(cm_node, NULL); - if (err) - WARN_ON(1); - } else - cm_id->add_ref(cm_id); + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + } else { + ret = send_mpa_reject(cm_node); + if (ret) { + cm_node->state = NES_CM_STATE_CLOSED; + err = send_reset(cm_node, NULL); + if (err) + WARN_ON(1); + } else + cm_id->add_ref(cm_id); + } } } else { cm_node->cm_id = NULL; - event.cm_node = loopback; - event.cm_info.rem_addr = loopback->rem_addr; - event.cm_info.loc_addr = loopback->loc_addr; - event.cm_info.rem_port = loopback->rem_port; - event.cm_info.loc_port = loopback->loc_port; - event.cm_info.cm_id = loopback->cm_id; - cm_event_mpa_reject(&event); - rem_ref_cm_node(cm_core, cm_node); - loopback->state = NES_CM_STATE_CLOSING; + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + rem_ref_cm_node(cm_core, loopback); + } else { + event.cm_node = loopback; + event.cm_info.rem_addr = loopback->rem_addr; + event.cm_info.loc_addr = loopback->loc_addr; + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + cm_event_mpa_reject(&event); + rem_ref_cm_node(cm_core, cm_node); + loopback->state = NES_CM_STATE_CLOSING; - cm_id = loopback->cm_id; - rem_ref_cm_node(cm_core, loopback); - cm_id->rem_ref(cm_id); + cm_id = loopback->cm_id; + rem_ref_cm_node(cm_core, loopback); + cm_id->rem_ref(cm_id); + } } return ret; @@ -2198,6 +2210,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod case NES_CM_STATE_UNKNOWN: case NES_CM_STATE_INITED: case NES_CM_STATE_CLOSED: + case NES_CM_STATE_LISTENER_DESTROYED: ret = rem_ref_cm_node(cm_core, cm_node); break; case NES_CM_STATE_TSA: @@ -2716,8 +2729,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct nes_pd *nespd; u64 tagged_offset; - - ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) return -EINVAL; @@ -2733,6 +2744,13 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) "%s\n", cm_node, nesvnic, nesvnic->netdev, nesvnic->netdev->name); + if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { + if (cm_node->loopbackpartner) + rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -EINVAL; + } + /* associate the node with the QP */ nesqp->cm_node = (void *)cm_node; cm_node->nesqp = nesqp; @@ -3003,6 +3021,9 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!nesdev) return -EINVAL; + if (!(cm_id->local_addr.sin_port) || !(cm_id->remote_addr.sin_port)) + return -EINVAL; + nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, ntohl(nesvnic->local_ipaddr), @@ -3375,7 +3396,7 @@ static void cm_event_connect_error(struct nes_cm_event *event) nesqp->cm_id = NULL; cm_id->provider_data = NULL; cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = IW_CM_EVENT_STATUS_REJECTED; + cm_event.status = -ECONNRESET; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 911846a..d9825fd 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -200,6 +200,7 @@ enum nes_cm_node_state { NES_CM_STATE_TIME_WAIT, NES_CM_STATE_LAST_ACK, NES_CM_STATE_CLOSING, + NES_CM_STATE_LISTENER_DESTROYED, NES_CM_STATE_CLOSED }; -- cgit v1.1 From f9f3f1e08b4d66bfda2a0c2d49a26c80489a0725 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:14 -0800 Subject: RDMA/nes: Abnormal listener exit causes loopback node crash When the listener is destroyed for a loopback connection, the listener node gets a reset event. This causes a crash as the listener is not expecting a reset event. Code review of cm_event_reset() during debugging showed the cm_id ref count is incremented after calling its event handler and not before. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index ec04786..20e21f1 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1014,18 +1014,6 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; loopback->state = NES_CM_STATE_CLOSED; - event.cm_node = cm_node; - event.cm_info.rem_addr = - cm_node->rem_addr; - event.cm_info.loc_addr = - cm_node->loc_addr; - event.cm_info.rem_port = - cm_node->rem_port; - event.cm_info.loc_port = - cm_node->loc_port; - event.cm_info.cm_id = cm_node->cm_id; - cm_event_reset(&event); - rem_ref_cm_node(cm_node->cm_core, cm_node); @@ -3440,6 +3428,8 @@ static void cm_event_reset(struct nes_cm_event *event) nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); nesqp = cm_id->provider_data; + if (!nesqp) + return; nesqp->cm_id = NULL; /* cm_id->provider_data = NULL; */ @@ -3451,8 +3441,8 @@ static void cm_event_reset(struct nes_cm_event *event) cm_event.private_data = NULL; cm_event.private_data_len = 0; - ret = cm_id->event_handler(cm_id, &cm_event); cm_id->add_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); atomic_inc(&cm_closes); cm_event.event = IW_CM_EVENT_CLOSE; cm_event.status = IW_CM_EVENT_STATUS_OK; -- cgit v1.1 From 886f98a31586fd560fe83c44ad72e3ebe62f8e2e Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:18 -0800 Subject: RDMA/nes: Fix Xansation test crash on cm_node ref_count While running a Xansation test, an active side node crashed. The problem started on the passive side, which generated an STtag that was 0. The passive side sent a TERMINATE instead of an MPA REJECT msg. The active side, receives TERMINATE and sends connect_err() and set the cm_node state to CLOSED. The passive side sends FIN + ACK after TERMINATE. Active side ends up in handle_ack_pkt() and send_reset(). send_reset() consumes 1 cm_node's ref_count. Because the cm_node is in CLOSED state, which means that cm_node will be destroyed after completion of the connect_err() indication, CM will crash after send_reset(). Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 20e21f1..a258168 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1610,6 +1610,7 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, break; case NES_CM_STATE_CLOSED: cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_TSA: @@ -1661,9 +1662,15 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, passive_open_err(cm_node, skb, 1); break; case NES_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; case NES_CM_STATE_CLOSED: cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_ESTABLISHED: @@ -1732,8 +1739,13 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, dev_kfree_skb_any(skb); break; case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; case NES_CM_STATE_CLOSED: cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_LAST_ACK: @@ -2193,8 +2205,11 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod case NES_CM_STATE_CLOSING: ret = -1; break; - case NES_CM_STATE_MPAREJ_RCVD: case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_MPAREJ_RCVD: case NES_CM_STATE_UNKNOWN: case NES_CM_STATE_INITED: case NES_CM_STATE_CLOSED: -- cgit v1.1 From fd000e12a564bdeaec5e5a438d341d9132409f26 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:23 -0800 Subject: RDMA/nes: Check for zero STag STags are generated randomly but the driver does not correctly prevent a zero STag. Using STag zero is privileged and causes a user space application to fail. This change prevents the driver from trying to allocate a zero STag. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 3 ++- drivers/infiniband/hw/nes/nes_verbs.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 9fc0273..b1c2cbb 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -437,11 +437,12 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); - /* mark the usual suspect QPs and CQs as in use */ + /* mark the usual suspect QPs, MR and CQs as in use */ for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { set_bit(u32temp, nesadapter->allocated_qps); set_bit(u32temp, nesadapter->allocated_cqs); } + set_bit(0, nesadapter->allocated_mrs); for (u32temp = 0; u32temp < 20; u32temp++) set_bit(u32temp, nesadapter->allocated_pds); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 67a87cb..53dc39f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2503,9 +2503,6 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, stag = stag_index << 8; stag |= driver_key; stag += (u32)stag_key; - if (stag == 0) { - stag = 1; - } iova_start = virt; /* Make the leaf PBL the root if only one PBL */ -- cgit v1.1 From d2fa9b26e181d1e3c3df06a57fa13b04afee0e16 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:28 -0800 Subject: RDMA/nes: Free kmap() resources We fail when creating many qps as kmap() fails for sq_vbase. Fix this by doing kunmap() as soon as we are done with sq_vbase. We do kunmap() in one of the locations below: (1) nes_destroy_qp() (2) nes_accept() (3) nes_connect_event We keep a flag to avoid multiple calls to kunmap(). Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 10 ++++++++++ drivers/infiniband/hw/nes/nes_verbs.c | 10 ++++++++-- drivers/infiniband/hw/nes/nes_verbs.h | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index a258168..b139806 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2836,6 +2837,10 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | @@ -3304,6 +3309,11 @@ static void cm_event_connected(struct nes_cm_event *event) wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + /* use the reserved spot on the WQ for the extra first WQE */ nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 53dc39f..64d3136 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1015,6 +1015,7 @@ static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, kunmap(nesqp->page); return -ENOMEM; } + nesqp->sq_kmapped = 1; nesqp->hwqp.q2_vbase = mem; mem += 256; memset(nesqp->hwqp.q2_vbase, 0, 256); @@ -1092,7 +1093,10 @@ static inline void nes_free_qp_mem(struct nes_device *nesdev, pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); nesqp->pbl_vbase = NULL; - kunmap(nesqp->page); + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } } } @@ -1501,8 +1505,10 @@ static int nes_destroy_qp(struct ib_qp *ibqp) nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; } } - if (nesqp->pbl_pbase) + if (nesqp->pbl_pbase && nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; kunmap(nesqp->page); + } } else { /* Clean any pending completions from the cq(s) */ if (nesqp->nesscq) diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index cc7a604..2df9993e 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -175,5 +175,6 @@ struct nes_qp { u8 hw_iwarp_state; u8 hw_tcp_state; u8 term_flags; + u8 sq_kmapped; }; #endif /* NES_VERBS_H */ -- cgit v1.1 From b1190d3e0d548615ee7c38c10b5fc376a76b7afd Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:32 -0800 Subject: RDMA/nes: FIN during MPA startup causes timeout A FIN that is received during an MPA start up sequence causes a timeout in iwcm.c. The connection has not been completely closed so the iwcm code is waiting for resources to be cleaned up. This closes the connection so everything cleans up correctly. Signed-off-by: Don Wood Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index b139806..4acf04a 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1354,13 +1354,20 @@ static void handle_fin_pkt(struct nes_cm_node *cm_node) case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_SYN_SENT: case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_MPAREQ_SENT: case NES_CM_STATE_MPAREJ_RCVD: cm_node->tcp_cntxt.rcv_nxt++; cleanup_retrans_entry(cm_node); cm_node->state = NES_CM_STATE_LAST_ACK; send_fin(cm_node, NULL); break; + case NES_CM_STATE_MPAREQ_SENT: + create_event(cm_node, NES_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + break; case NES_CM_STATE_FIN_WAIT1: cm_node->tcp_cntxt.rcv_nxt++; cleanup_retrans_entry(cm_node); -- cgit v1.1 From 7a576dfd9ed4fd0f32bb838ce4f644af201ac7df Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:33 -0800 Subject: RDMA/nes: Fix stale ARP issue When the remote node's ethernet address changes, the connection keeps trying to connect using the old address. The connection wil continue failing until the driver is unloaded and loaded again (eiter reboot or rmmod). Fix this by checking that the NIC has the correct address before starting a connection. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 4acf04a..39468c27 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1098,12 +1098,13 @@ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, /** * nes_addr_resolve_neigh */ -static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) +static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) { struct rtable *rt; struct flowi fl; struct neighbour *neigh; - int rc = -1; + int rc = arpindex; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = htonl(dst_ip); @@ -1119,6 +1120,21 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" " is %pM, Gateway is 0x%08X \n", dst_ip, neigh->ha, ntohl(rt->rt_gateway)); + + if (arpindex >= 0) { + if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, + neigh->ha, ETH_ALEN)){ + /* Mac address same as in nes_arp_table */ + neigh_release(neigh); + ip_rt_put(rt); + return rc; + } + + nes_manage_arp_cache(nesvnic->netdev, + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); + } + nes_manage_arp_cache(nesvnic->netdev, neigh->ha, dst_ip, NES_ARP_ADD); rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, @@ -1134,7 +1150,6 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) return rc; } - /** * make_cm_node - create a new instance of a cm node */ @@ -1144,6 +1159,7 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, { struct nes_cm_node *cm_node; struct timespec ts; + int oldarpindex = 0; int arpindex = 0; struct nes_device *nesdev; struct nes_adapter *nesadapter; @@ -1197,17 +1213,18 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, nesadapter = nesdev->nesadapter; cm_node->loopbackpartner = NULL; + /* get the mac addr for the remote node */ if (ipv4_is_loopback(htonl(cm_node->rem_addr))) arpindex = nes_arp_table(nesdev, ntohl(nesvnic->local_ipaddr), NULL, NES_ARP_RESOLVE); - else - arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + else { + oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); + + } if (arpindex < 0) { - arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr); - if (arpindex < 0) { - kfree(cm_node); - return NULL; - } + kfree(cm_node); + return NULL; } /* copy the mac addr to node context */ -- cgit v1.1 From 48617f862f9e58ca2a609fea6a76733aff55d672 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Tue, 15 Dec 2009 23:39:10 -0800 Subject: RDMA/cxgb3: Fix error paths in post_send and post_recv Always set bad_wr when an immediate error is detected. Return ENOMEM for queue full instead of EINVAL to match other drivers. Signed-off-by: Frank Zago Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_qp.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 1cecf98..3eb8cec 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -365,18 +365,19 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (qhp->attr.state > IWCH_QP_STATE_RTS) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, qhp->wq.sq_size_log2); if (num_wrs <= 0) { spin_unlock_irqrestore(&qhp->lock, flag); - return -ENOMEM; + err = -ENOMEM; + goto out; } while (wr) { if (num_wrs == 0) { err = -ENOMEM; - *bad_wr = wr; break; } idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); @@ -428,10 +429,8 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wr->opcode); err = -EINVAL; } - if (err) { - *bad_wr = wr; + if (err) break; - } wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; sqp->wr_id = wr->wr_id; sqp->opcode = wr2opcode(t3_wr_opcode); @@ -454,6 +453,10 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } spin_unlock_irqrestore(&qhp->lock, flag); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; return err; } @@ -471,18 +474,19 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (qhp->attr.state > IWCH_QP_STATE_RTS) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, qhp->wq.rq_size_log2) - 1; if (!wr) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -ENOMEM; + goto out; } while (wr) { if (wr->num_sge > T3_MAX_SGE) { err = -EINVAL; - *bad_wr = wr; break; } idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); @@ -494,10 +498,10 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, err = build_zero_stag_recv(qhp, wqe, wr); else err = -ENOMEM; - if (err) { - *bad_wr = wr; + + if (err) break; - } + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP); @@ -511,6 +515,10 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, } spin_unlock_irqrestore(&qhp->lock, flag); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; return err; } -- cgit v1.1