From c1ccaf2478f84c2665cf57f981db143aa582d646 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 12 Nov 2009 11:32:27 -0800 Subject: IB/iser: Rewrite SG handling for RDMA logic After dma-mapping an SG list provided by the SCSI midlayer, iser has to make sure the mapped SG is "aligned for RDMA" in the sense that its possible to produce one mapping in the HCA IOMMU which represents the whole SG. Next, the mapped SG is formatted for registration with the HCA. This patch re-writes the logic that does the above, to make it clearer and simpler. It also fixes a bug in the being aligned for RDMA checks, where a "start" check wasn't done but rather only "end" check. Signed-off-by: Alexander Nezhinsky Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_memory.c | 122 ++++++++++++++---------------- 1 file changed, 56 insertions(+), 66 deletions(-) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index b9453d0..274c883 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -209,6 +209,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, mem_copy->copy_buf = NULL; } +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + /** * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses * and returns the length of resulting physical address array (may be less than @@ -221,62 +223,52 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, * where --few fragments of the same page-- are present in the SG as * consecutive elements. Also, it handles one entry SG. */ + static int iser_sg_to_page_vec(struct iser_data_buf *data, struct iser_page_vec *page_vec, struct ib_device *ibdev) { - struct scatterlist *sgl = (struct scatterlist *)data->buf; - struct scatterlist *sg; - u64 first_addr, last_addr, page; - int end_aligned; - unsigned int cur_page = 0; + struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + u64 start_addr, end_addr, page, chunk_start = 0; unsigned long total_sz = 0; - int i; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; /* compute the offset of first element */ page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; + new_chunk = 1; + cur_page = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; total_sz += dma_len; - first_addr = ib_sg_dma_address(ibdev, sg); - last_addr = first_addr + dma_len; - - end_aligned = !(last_addr & ~MASK_4K); - - /* continue to collect page fragments till aligned or SG ends */ - while (!end_aligned && (i + 1 < data->dma_nents)) { - sg = sg_next(sg); - i++; - dma_len = ib_sg_dma_len(ibdev, sg); - total_sz += dma_len; - last_addr = ib_sg_dma_address(ibdev, sg) + dma_len; - end_aligned = !(last_addr & ~MASK_4K); + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; } - - /* handle the 1st page in the 1st DMA element */ - if (cur_page == 0) { - page = first_addr & MASK_4K; - page_vec->pages[cur_page] = page; - cur_page++; + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + page_vec->pages[cur_page++] = page; page += SIZE_4K; - } else - page = first_addr; - - for (; page < last_addr; page += SIZE_4K) { - page_vec->pages[cur_page] = page; - cur_page++; - } - + } while (page < end_addr); } + page_vec->data_size = total_sz; iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); return cur_page; } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned @@ -284,42 +276,40 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. */ -static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, - struct ib_device *ibdev) +static int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) { - struct scatterlist *sgl, *sg; - u64 end_addr, next_addr; - int i, cnt; - unsigned int ret_len = 0; + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return 1; sgl = (struct scatterlist *)data->buf; + start_addr = ib_sg_dma_address(ibdev, sgl); - cnt = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " - "offset: %ld sz: %ld\n", i, - (unsigned long)sg_phys(sg), - (unsigned long)sg->offset, - (unsigned long)sg->length); */ - end_addr = ib_sg_dma_address(ibdev, sg) + - ib_sg_dma_len(ibdev, sg); - /* iser_dbg("Checking sg iobuf end address " - "0x%08lX\n", end_addr); */ - if (i + 1 < data->dma_nents) { - next_addr = ib_sg_dma_address(ibdev, sg_next(sg)); - /* are i, i+1 fragments of the same page? */ - if (end_addr == next_addr) { - cnt++; - continue; - } else if (!IS_4K_ALIGNED(end_addr)) { - ret_len = cnt + 1; - break; - } - } - cnt++; + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; } - if (i == data->dma_nents) - ret_len = cnt; /* loop ended */ + ret_len = (next_sg) ? i : i+1; iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", ret_len, data->dma_nents, data); return ret_len; -- cgit v1.1 From 0cd4d0fd9b0a4e10c091fc6316d1bf92885dcd9c Mon Sep 17 00:00:00 2001 From: "David J. Wilder" Date: Wed, 9 Dec 2009 10:03:00 -0800 Subject: IPoIB: Clear ipoib_neigh.dgid in ipoib_neigh_alloc() IPoIB can miss a change in destination GID under some conditions. The problem is caused when ipoib_neigh->dgid contains a stale address. The fix is to set ipoib_neigh->dgid to zero in ipoib_neigh_alloc(). This can happen when a system using bonding on its IPoIB interfaces has switched its active interface from interface A to B and back to A. The system that fails over will not correctly processes the 2nd address change, as described below. When an address has changed neighbor->ha is updated with the new address. Each neighbor has an associated ipoib_neigh. ipoib_neigh->dgid also holds a copy of the remote node's hardware address. When an address changes neighbor->ha is updated by the network layer (arp code) with the new address. IPoIB detects this change in ipoib_start_xmit() by comparing neighbor->ha with ipoib_neigh->dgid. The bug is that ipoib_neigh->dgid may already contain the new address (A) thus the change from B to A is missed by ipoib. Here is the sequence of events: ipoib_neigh->dgid = A and neighbor->ha = A The address is switched to B (the first switch) neighbor->ha = B The change is seen in ipoib_start_xmit() -- neighbor->ha != ipoib_neigh->dgid so ipoib_neigh is released, and a new one is allocated. The allocator may return the same chunk of memory that was just released, therefore ipoib_neigh->dgid still contains A at this point. ipoib_neigh->dgid should be updated in neigh_add_path(), but if the following conditions are true dgid is not updated: 1) __path_find() returns a path 2) path->ah is NULL The remote system now switches from address B to A, neighbor->ha is updated to A. Now we have again : ipoib_neigh->dgid = A and neighbor->ha = A Since the addresses are the same ipoib won't process the change in address. Fix this by zeroing out the dgid field when allocating a new struct ipoib_neigh. Signed-off-by: David Wilder Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2bf5116..df3eb8c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -884,6 +884,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh->neighbour = neighbour; neigh->dev = dev; + memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); ipoib_cm_set(neigh, NULL); -- cgit v1.1