Diffstat (limited to 'sys/ofed/drivers/infiniband/hw/mlx4/mr.c')
-rw-r--r--	sys/ofed/drivers/infiniband/hw/mlx4/mr.c	515
1 file changed, 422 insertions(+), 93 deletions(-)
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
index c49b460..24d9520 100644
--- a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
@@ -31,6 +31,15 @@
* SOFTWARE.
*/
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+#ifdef __linux__
+#include <linux/proc_fs.h>
+#include <linux/cred.h>
+#endif
+
#include "mlx4_ib.h"
static u32 convert_access(int acc)
@@ -41,13 +50,67 @@ static u32 convert_access(int acc)
(acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
MLX4_PERM_LOCAL_READ;
}
+#ifdef __linux__
+static ssize_t shared_mr_proc_read(struct file *file,
+ char __user *buffer,
+ size_t len,
+ loff_t *offset)
+{
+
+ return -ENOSYS;
+
+}
+
+static ssize_t shared_mr_proc_write(struct file *file,
+ const char __user *buffer,
+ size_t len,
+ loff_t *offset)
+{
+
+ return -ENOSYS;
+}
+
+static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+
+ struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
+ struct mlx4_shared_mr_info *smr_info =
+ (struct mlx4_shared_mr_info *)pde->data;
+
+ /* Prevent any mapping not on start of area */
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ return ib_umem_map_to_vma(smr_info->umem,
+ vma);
+
+}
+
+static const struct file_operations shared_mr_proc_ops = {
+ .owner = THIS_MODULE,
+ .read = shared_mr_proc_read,
+ .write = shared_mr_proc_write,
+ .mmap = shared_mr_mmap
+};
+
+static mode_t convert_shared_access(int acc)
+{
+ return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR : 0) |
+ (acc & IB_ACCESS_SHARED_MR_USER_WRITE ? S_IWUSR : 0) |
+ (acc & IB_ACCESS_SHARED_MR_GROUP_READ ? S_IRGRP : 0) |
+ (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) |
+ (acc & IB_ACCESS_SHARED_MR_OTHER_READ ? S_IROTH : 0) |
+ (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0);
+
+}
+#endif
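[Editor's note: the mode bits composed by convert_shared_access() are ordinary POSIX permission bits. A minimal userspace sketch of the idea follows; the IB_ACCESS_SHARED_MR_* values are placeholders for illustration, not the real OFED definitions.]

	/* Userspace sketch of convert_shared_access(); flag values assumed. */
	#include <stdio.h>
	#include <sys/stat.h>

	#define IB_ACCESS_SHARED_MR_USER_READ   (1 << 8)   /* assumed value */
	#define IB_ACCESS_SHARED_MR_GROUP_READ  (1 << 10)  /* assumed value */

	static mode_t convert_shared_access(int acc)
	{
		return (acc & IB_ACCESS_SHARED_MR_USER_READ  ? S_IRUSR : 0) |
		       (acc & IB_ACCESS_SHARED_MR_GROUP_READ ? S_IRGRP : 0);
	}

	int main(void)
	{
		mode_t mode = S_IFREG | convert_shared_access(
			IB_ACCESS_SHARED_MR_USER_READ |
			IB_ACCESS_SHARED_MR_GROUP_READ);
		printf("%o\n", (unsigned)mode);	/* 100440: regular file, r--r----- */
		return 0;
	}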
struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
struct mlx4_ib_mr *mr;
int err;
- mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ mr = kzalloc(sizeof *mr, GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
@@ -74,118 +137,350 @@ err_free:
return ERR_PTR(err);
}
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+ struct mlx4_mtt *mtt,
+ u64 mtt_size,
+ u64 mtt_shift,
+ u64 len,
+ u64 cur_start_addr,
+ u64 *pages,
+ int *start_index,
+ int *npages)
+{
+ int k;
+ int err = 0;
+ u64 mtt_entries;
+ u64 cur_end_addr = cur_start_addr + len;
+ u64 cur_end_addr_aligned = 0;
+
+ len += (cur_start_addr & (mtt_size-1ULL));
+ cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+ len += (cur_end_addr_aligned - cur_end_addr);
+ if (len & (mtt_size-1ULL)) {
+ WARN(1,
+ "write_block: len %llx is not aligned to mtt_size %llx\n",
+ len, mtt_size);
+ return -EINVAL;
+ }
+
+
+ mtt_entries = (len >> mtt_shift);
+
+ /* Align the MTT start address to the mtt_size.
+  * Required to handle cases when the MR starts in the middle
+  * of an MTT record. This was not required in the old code,
+  * since the physical addresses provided by the dma subsystem
+  * were page aligned, which was also the MTT size.
+  */
+ cur_start_addr = round_down(cur_start_addr, mtt_size);
+ /* A new block is started ...*/
+ for (k = 0; k < mtt_entries; ++k) {
+ pages[*npages] = cur_start_addr + (mtt_size * k);
+ (*npages)++;
+ /*
+ * Be friendly to mlx4_write_mtt() and
+ * pass it chunks of appropriate size.
+ */
+ if (*npages == PAGE_SIZE / sizeof(u64)) {
+ err = mlx4_write_mtt(dev->dev,
+ mtt, *start_index,
+ *npages, pages);
+ if (err)
+ return err;
+
+ (*start_index) += *npages;
+ *npages = 0;
+ }
+ }
+
+ return 0;
+}
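[Editor's note: to see what the head/tail padding in mlx4_ib_umem_write_mtt_block() does, here is a standalone sketch of the same arithmetic, runnable in userspace; addresses and sizes are invented for illustration.]

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t mtt_size = 1ULL << 12;           /* 4 KiB per MTT entry */
		uint64_t cur_start_addr = 0x10000a00ULL;  /* starts mid-page */
		uint64_t len = 0x1c00ULL;
		uint64_t cur_end_addr = cur_start_addr + len;

		/* Extend len to cover the misaligned head of the block... */
		len += cur_start_addr & (mtt_size - 1ULL);
		/* ...and the misaligned tail (round the end up to mtt_size). */
		uint64_t end_aligned = (cur_end_addr + mtt_size - 1) & ~(mtt_size - 1);
		len += end_aligned - cur_end_addr;

		/* 0x3000 total -> 3 MTT entries */
		printf("mtt entries: %llu\n", (unsigned long long)(len >> 12));
		return 0;
	}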
+
int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
struct ib_umem *umem)
{
u64 *pages;
struct ib_umem_chunk *chunk;
- int i, j, k;
- int n;
- int len;
+ int j;
+ u64 len = 0;
int err = 0;
+ u64 mtt_size;
+ u64 cur_start_addr = 0;
+ u64 mtt_shift;
+ int start_index = 0;
+ int npages = 0;
pages = (u64 *) __get_free_page(GFP_KERNEL);
if (!pages)
return -ENOMEM;
- i = n = 0;
+ mtt_shift = mtt->page_shift;
+ mtt_size = 1ULL << mtt_shift;
list_for_each_entry(chunk, &umem->chunk_list, list)
for (j = 0; j < chunk->nmap; ++j) {
- len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = sg_dma_address(&chunk->page_list[j]) +
- umem->page_size * k;
- /*
- * Be friendly to mlx4_write_mtt() and
- * pass it chunks of appropriate size.
- */
- if (i == PAGE_SIZE / sizeof (u64)) {
- err = mlx4_write_mtt(dev->dev, mtt, n,
- i, pages);
- if (err)
- goto out;
- n += i;
- i = 0;
- }
+ if (cur_start_addr + len ==
+ sg_dma_address(&chunk->page_list[j])) {
+ /* still the same block */
+ len += sg_dma_len(&chunk->page_list[j]);
+ continue;
}
+ /* A new block is started ...*/
+ /* If len is misaligned, write an extra mtt entry to
+  * cover the misaligned area (round up the division).
+  */
+ err = mlx4_ib_umem_write_mtt_block(dev,
+ mtt, mtt_size, mtt_shift,
+ len, cur_start_addr,
+ pages,
+ &start_index,
+ &npages);
+ if (err)
+ goto out;
+
+ cur_start_addr =
+ sg_dma_address(&chunk->page_list[j]);
+ len = sg_dma_len(&chunk->page_list[j]);
}
- if (i)
- err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+ /* Handle the last block */
+ if (len > 0) {
+ /* If len is misaligned, write an extra mtt entry to
+  * cover the misaligned area (round up the division).
+  */
+ err = mlx4_ib_umem_write_mtt_block(dev,
+ mtt, mtt_size, mtt_shift,
+ len, cur_start_addr,
+ pages,
+ &start_index,
+ &npages);
+ if (err)
+ goto out;
+ }
+
+
+ if (npages)
+ err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
out:
free_page((unsigned long) pages);
return err;
}
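[Editor's note: the rewritten loop above coalesces physically adjacent scatter entries into one block before emitting MTT entries. A minimal userspace sketch of that coalescing, with invented segment addresses:]

	#include <stdint.h>
	#include <stdio.h>

	struct seg { uint64_t addr, len; };

	int main(void)
	{
		/* Two physically adjacent segments, then a discontiguous one. */
		struct seg segs[] = {
			{0x1000, 0x1000}, {0x2000, 0x2000}, {0x8000, 0x1000}
		};
		uint64_t start = segs[0].addr, len = segs[0].len;

		for (int i = 1; i < 3; i++) {
			if (start + len == segs[i].addr) {	/* still the same block */
				len += segs[i].len;
				continue;
			}
			printf("block at 0x%llx, len 0x%llx\n",
			       (unsigned long long)start, (unsigned long long)len);
			start = segs[i].addr;			/* a new block starts */
			len = segs[i].len;
		}
		printf("block at 0x%llx, len 0x%llx\n",		/* the last block */
		       (unsigned long long)start, (unsigned long long)len);
		return 0;
	}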
-static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr,
- u64 start, u64 virt_addr, int access_flags)
+static inline u64 alignment_of(u64 ptr)
{
-#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__)
- struct mlx4_ib_dev *dev = to_mdev(pd->device);
- struct ib_umem_chunk *chunk;
- unsigned dsize;
- dma_addr_t daddr;
- unsigned cur_size = 0;
- dma_addr_t uninitialized_var(cur_addr);
- int n;
- struct ib_umem *umem = mr->umem;
- u64 *arr;
- int err = 0;
- int i;
- int j = 0;
- int off = start & (HPAGE_SIZE - 1);
+ return ilog2(ptr & (~(ptr-1)));
+}
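[Editor's note: alignment_of() returns the index of the lowest set bit of ptr, i.e. the largest power-of-two alignment ptr satisfies; ptr & ~(ptr - 1) isolates that bit and ilog2() gives its index. A userspace equivalent, using __builtin_ctzll in place of the kernel's ilog2():]

	#include <stdint.h>
	#include <stdio.h>

	static unsigned alignment_of(uint64_t ptr)
	{
		/* ptr must be nonzero; ptr & ~(ptr - 1) keeps only the lowest
		 * set bit, and ctz of that bit is its index. */
		return __builtin_ctzll(ptr & (~(ptr - 1)));
	}

	int main(void)
	{
		printf("%u\n", alignment_of(0x3000));	/* 12: 4 KiB aligned */
		printf("%u\n", alignment_of(0x10a00));	/* 9: only 512-byte aligned */
		return 0;
	}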
- n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE);
- arr = kmalloc(n * sizeof *arr, GFP_KERNEL);
- if (!arr)
- return -ENOMEM;
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+ u64 current_block_end,
+ u64 block_shift)
+{
+ /* Check whether the new block is as well aligned as the
+  * previous block. The block address must start with zeros
+  * up to entity_size.
+  */
+ if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+ /* It is not as well aligned as the previous
+  * block - reduce the mtt size accordingly.
+  * Here we take the lowest bit which is set.
+  */
+ block_shift = alignment_of(next_block_start);
+
+ /* Check whether the end of the previous block
+  * is as well aligned as its start.
+  */
+ if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+ /* It is not as well aligned as the start of
+  * the block - reduce the mtt size accordingly.
+  */
+ block_shift = alignment_of(current_block_end);
+
+ return block_shift;
+}
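[Editor's note: a worked example with hypothetical addresses. With an incoming block_shift of 12, a next_block_start of 0x10a00 has nonzero low 12 bits, so the shift drops to alignment_of(0x10a00) = 9:]

	/* Sketch only; addresses are invented for illustration. */
	int shift = mlx4_ib_umem_calc_block_mtt(0x10a00, /* next_block_start */
						0x10a00, /* current_block_end */
						12);     /* current block_shift */
	/* shift == 9: each MTT entry now covers 512 bytes instead of 4 KiB. */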
- list_for_each_entry(chunk, &umem->chunk_list, list)
- for (i = 0; i < chunk->nmap; ++i) {
- daddr = sg_dma_address(&chunk->page_list[i]);
- dsize = sg_dma_len(&chunk->page_list[i]);
- if (!cur_size) {
- cur_addr = daddr;
- cur_size = dsize;
- } else if (cur_addr + cur_size != daddr) {
- err = -EINVAL;
- goto out;
- } else
- cur_size += dsize;
+/* Calculate the optimal mtt size based on contiguous pages.
+ * The function also returns the number of pages that are not aligned to
+ * the calculated mtt_size, to be added to the total number of pages. For
+ * that we check the first and last chunk lengths: if either is not
+ * aligned to mtt_size, we increment the non_aligned_pages count.
+ * All chunks in the middle are already handled as part of the mtt shift
+ * calculation, for both their start and end addresses.
+ */
+int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+ u64 start_va,
+ int *num_of_mtts)
+{
+ struct ib_umem_chunk *chunk;
+ int j;
+ u64 block_shift = MLX4_MAX_MTT_SHIFT;
+ u64 current_block_len = 0;
+ u64 current_block_start = 0;
+ u64 misalignment_bits;
+ u64 first_block_start = 0;
+ u64 last_block_end = 0;
+ u64 total_len = 0;
+ u64 last_block_aligned_end = 0;
+ u64 min_shift = ilog2(umem->page_size);
+
+ list_for_each_entry(chunk, &umem->chunk_list, list) {
+ /* Initialization: save the first chunk start as
+  * current_block_start - a block is a run of
+  * contiguous pages.
+  */
+ if (current_block_len == 0 && current_block_start == 0) {
+ first_block_start = current_block_start =
+ sg_dma_address(&chunk->page_list[0]);
+ /* Find the bits that are different between
+ the physical address and the virtual
+ address for the start of the MR.
+ */
+ /* umem_get aligned the start_va to a page
+  * boundary. Therefore, we need to align the
+  * start va to the same boundary. */
+ /* misalignment_bits is needed to handle the
+ case of a single memory region. In this
+ case, the rest of the logic will not reduce
+ the block size. If we use a block size
+ which is bigger than the alignment of the
+ misalignment bits, we might use the virtual
+ page number instead of the physical page
+ number, resulting in access to the wrong
+ data. */
+ misalignment_bits =
+ (start_va & (~(((u64)(umem->page_size))-1ULL)))
+ ^ current_block_start;
+ block_shift = min(alignment_of(misalignment_bits)
+ , block_shift);
+ }
- if (cur_size > HPAGE_SIZE) {
- err = -EINVAL;
- goto out;
- } else if (cur_size == HPAGE_SIZE) {
- cur_size = 0;
- arr[j++] = cur_addr;
+ /* Go over the scatter entries in the current chunk, check
+ if they continue the previous scatter entry.
+ */
+ for (j = 0; j < chunk->nmap; ++j) {
+ u64 next_block_start =
+ sg_dma_address(&chunk->page_list[j]);
+ u64 current_block_end = current_block_start
+ + current_block_len;
+ /* If we have a split (non-contiguous) between two blocks */
+ if (current_block_end != next_block_start) {
+ block_shift = mlx4_ib_umem_calc_block_mtt(
+ next_block_start,
+ current_block_end,
+ block_shift);
+
+ /* If we reached the minimum shift for 4k
+ page we stop the loop.
+ */
+ if (block_shift <= min_shift)
+ goto end;
+
+ /* Save the length of the block that just
+  * ended; it is used to calculate the
+  * non_aligned_pages count at the end.
+  */
+ total_len += current_block_len;
+
+ /* Start a new block */
+ current_block_start = next_block_start;
+ current_block_len =
+ sg_dma_len(&chunk->page_list[j]);
+ continue;
}
+ /* The scatter entry is another part of the
+  * current block; increase the block size.
+  * An entry in the scatterlist can be larger than
+  * 4k (a page) because the dma mapping may merge
+  * some blocks together.
+  */
+ current_block_len +=
+ sg_dma_len(&chunk->page_list[j]);
}
+ }
- if (cur_size) {
- arr[j++] = cur_addr;
+ /* Account for the last block in the total len */
+ total_len += current_block_len;
+ /* Add to the first block the misalignment that it suffers from.*/
+ total_len += (first_block_start & ((1ULL<<block_shift)-1ULL));
+ last_block_end = current_block_start+current_block_len;
+ last_block_aligned_end = round_up(last_block_end, 1<<block_shift);
+ total_len += (last_block_aligned_end - last_block_end);
+
+ WARN((total_len & ((1ULL<<block_shift)-1ULL)),
+ "misaligned total length detected (%llu, %llu)!\n",
+ total_len, block_shift);
+
+ *num_of_mtts = total_len >> block_shift;
+end:
+ if (block_shift < min_shift) {
+ /* If shift is less than the min we set a WARN and
+ return the min shift.
+ */
+ WARN(1,
+ "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
+ block_shift);
+
+ block_shift = min_shift;
}
+ return block_shift;
+}
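[Editor's note: the returned shift becomes the MTT page shift and n is updated to the optimal MTT count; both feed straight into mlx4_mr_alloc(), exactly as in the mlx4_ib_reg_user_mr() hunk further down:]

	n = ib_umem_page_count(mr->umem);
	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			    convert_access(access_flags), n, shift, &mr->mmr);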
- err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length,
- convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr);
- if (err)
- goto out;
+#ifdef __linux__
+static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
+{
+ struct proc_dir_entry *mr_proc_entry;
+ mode_t mode = S_IFREG;
+ char name_buff[16];
+
+ mode |= convert_shared_access(access_flags);
+ sprintf(name_buff, "%X", mr_id);
+ mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
+ mr->smr_info->mr_id = mr_id;
+ mr->smr_info->umem = mr->umem;
+
+ mr_proc_entry = proc_create_data(name_buff, mode,
+ mlx4_mrs_dir_entry,
+ &shared_mr_proc_ops,
+ mr->smr_info);
+
+ if (!mr_proc_entry) {
+ pr_err("prepare_shared_mr failed via proc\n");
+ kfree(mr->smr_info);
+ return -ENODEV;
+ }
- err = mlx4_write_mtt(dev->dev, &mr->mmr.mtt, 0, n, arr);
+ current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
+ mr_proc_entry->size = mr->umem->length;
+ return 0;
-out:
- kfree(arr);
- return err;
-#else
- return -ENOSYS;
-#endif
}
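[Editor's note: a hypothetical userspace consumer would open the procfs entry (named by the mr_id in hex; the directory path depends on where mlx4_mrs_dir_entry is registered and is assumed below) and mmap it at offset 0, matching the vm_pgoff check in shared_mr_mmap() above:]

	/* Hypothetical consumer; the /proc path is an assumption. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		char path[64];
		unsigned mr_id = 0x1A;	/* invented id */
		size_t length = 4096;	/* must match the MR length */

		snprintf(path, sizeof(path), "/proc/driver/mlx4_mrs/%X", mr_id);
		int fd = open(path, O_RDWR);
		if (fd < 0)
			return 1;
		/* Offset must be 0: shared_mr_mmap() rejects nonzero vm_pgoff. */
		void *p = mmap(NULL, length, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		return p == MAP_FAILED;
	}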
+static int is_shared_mr(int access_flags)
+{
+ /* Check whether IB_ACCESS_SHARED_MR_USER_READ or any of the
+  * other shared access bits are set.
+  */
+ return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
+ IB_ACCESS_SHARED_MR_USER_WRITE |
+ IB_ACCESS_SHARED_MR_GROUP_READ |
+ IB_ACCESS_SHARED_MR_GROUP_WRITE |
+ IB_ACCESS_SHARED_MR_OTHER_READ |
+ IB_ACCESS_SHARED_MR_OTHER_WRITE));
+
+}
+#endif
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
- struct ib_udata *udata)
+ struct ib_udata *udata,
+ int mr_id)
{
struct mlx4_ib_dev *dev = to_mdev(pd->device);
struct mlx4_ib_mr *mr;
@@ -193,38 +488,49 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
int err;
int n;
- mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ mr = kzalloc(sizeof *mr, GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
mr->umem = ib_umem_get(pd->uobject->context, start, length,
- access_flags, 0);
+ access_flags, 0);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
goto err_free;
}
- if (!mr->umem->hugetlb ||
- handle_hugetlb_user_mr(pd, mr, start, virt_addr, access_flags)) {
- n = ib_umem_page_count(mr->umem);
- shift = ilog2(mr->umem->page_size);
-
- err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
- convert_access(access_flags), n, shift, &mr->mmr);
- if (err)
- goto err_umem;
+ n = ib_umem_page_count(mr->umem);
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
+ &n);
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+ convert_access(access_flags), n, shift, &mr->mmr);
+ if (err)
+ goto err_umem;
- err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
- if (err)
- goto err_mr;
- }
+ err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+ if (err)
+ goto err_mr;
err = mlx4_mr_enable(dev->dev, &mr->mmr);
if (err)
goto err_mr;
mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+#ifdef __linux__
+ /* Check whether MR should be shared */
+ if (is_shared_mr(access_flags)) {
+ /* The start address and length must be aligned to the page
+  * size in order to map a full page and prevent leakage of data */
+ if (mr->umem->offset || (length & ~PAGE_MASK)) {
+ err = -EINVAL;
+ goto err_mr;
+ }
+ err = prepare_shared_mr(mr, access_flags, mr_id);
+ if (err)
+ goto err_mr;
+ }
+#endif
return &mr->ibmr;
err_mr:
@@ -239,13 +545,36 @@ err_free:
return ERR_PTR(err);
}
+
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx4_ib_mr *mr = to_mmr(ibmr);
mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (mr->smr_info) {
+ /* When the master/parent shared mr is deregistered, this mr
+  * can no longer be shared - its mr_id is returned to the
+  * kernel as part of ib_uverbs_dereg_mr and may be allocated
+  * again by another reg_mr.
+  */
+ char name_buff[16];
+
+ sprintf(name_buff, "%X", mr->smr_info->mr_id);
+ /* remove_proc_entry() checks internally that no operation
+  * was started on that procfs file; if one is in progress,
+  * the current process waits until it completes. That is why
+  * no extra sync mechanism is needed when we release the
+  * shared umem below.
+  */
+#ifdef __linux__
+ remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
+ kfree(mr->smr_info);
+#endif
+ }
+
if (mr->umem)
ib_umem_release(mr->umem);
+
kfree(mr);
return 0;
@@ -258,7 +587,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
struct mlx4_ib_mr *mr;
int err;
- mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ mr = kzalloc(sizeof *mr, GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
@@ -291,7 +620,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device
struct mlx4_ib_fast_reg_page_list *mfrpl;
int size = page_list_len * sizeof (u64);
- if (page_list_len > MAX_FAST_REG_PAGES)
+ if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
return ERR_PTR(-EINVAL);
mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
@@ -403,7 +732,7 @@ int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
err = mlx4_SYNC_TPT(mdev);
if (err)
- printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when "
+ pr_warn("SYNC_TPT error %d when "
"unmapping FMRs\n", err);
return 0;