From d541e45500bd269060c26387902e1bec9783c07c Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:43 -0400 Subject: IB/core: Convert ah_attr from OPA to IB when copying to user OPA address handle atttibutes that have 32 bit LIDs would have to be converted to IB address handle attribute with the LID field programmed in the GID before copying to user space. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 10 ++++--- drivers/infiniband/core/uverbs_marshall.c | 48 +++++++++++++++++++++++++++---- include/rdma/ib_marshall.h | 6 ++-- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c..f2a7f62 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef..eb85b54 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989..bd0acf3 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include #include -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 68cef3b..8ebf84a 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -38,10 +38,12 @@ #include #include -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src); -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, struct rdma_ah_attr *src); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, -- cgit v1.1 From 4c4736905e456819223c6e879ca8fe7cce877939 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:44 -0400 Subject: IB/srpt: Increase lid and sm_lid to 32 bits srpt contains lid and sm_lid fields which are 16 bits in length, increase them to 32 bits. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index cc118385..1b817e5 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -328,8 +328,8 @@ struct srpt_port { u8 port_guid[24]; u8 port_gid[64]; u8 port; - u16 sm_lid; - u16 lid; + u32 sm_lid; + u32 lid; union ib_gid gid; struct work_struct work; struct se_portal_group port_guid_tpg; -- cgit v1.1 From 7e93e2cb835c4301663c1ac5e0135d2e06e1c5e8 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:45 -0400 Subject: IB/IPoIB: Increase local_lid to 32 bits IPoIB contains local_lid field which is 16 bits in length, increase it to 32 bits. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ff50a7b..9e73810 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -366,7 +366,7 @@ struct ipoib_dev_priv { u32 qkey; union ib_gid local_gid; - u16 local_lid; + u32 local_lid; unsigned int admin_mtu; unsigned int mcast_mtu; -- cgit v1.1 From 1cb2fc0db764dae2c484dac5c93824003fe571fb Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:46 -0400 Subject: IB/mad: Change slid in RMPP recv from 16 to 32 bits MAD RMPP contains slid field which is 16 bits in length, increase it to 32 bits. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad_rmpp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 0d3cca0..e5cf09c 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -64,7 +64,7 @@ struct mad_rmpp_recv { __be64 tid; u32 src_qp; - u16 slid; + u32 slid; u8 mgmt_class; u8 class_version; u8 method; -- cgit v1.1 From 582faf3150f57b8364ac9d2aa731d7368ada7a4b Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:47 -0400 Subject: IB/core: Change port_attr.lid size from 16 to 32 bits lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 5 ++++- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- include/rdma/ib_verbs.h | 2 +- include/rdma/opa_addr.h | 3 ++- 7 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae675..6b54280 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include #include +#include #include #include "mad_priv.h" diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533..eef2623 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,8 +275,11 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; resp.sm_lid = attr.sm_lid; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + else + resp.lid = (u16)attr.lid; resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index ea24230..5a897b0 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.lid = cpu_to_be16((u16)attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 21d31cb..00f0570 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 7df3db7..617531f 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b573243..4fa94e6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,8 +549,8 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 lid; u16 sm_lid; + u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index eace28f..46d0567 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -50,7 +50,8 @@ #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) - +#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ + ? 0 : x) /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid * contains the OPA_STL_OUI identifier. This identifies that -- cgit v1.1 From db58540b021a17e0ede64f761b740556d77f1679 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:48 -0400 Subject: IB/core: Change port_attr.sm_lid from 16 to 32 bits sm_lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 8 +++++--- include/rdma/ib_verbs.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index eef2623..01e2ff0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,11 +275,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.sm_lid = attr.sm_lid; - if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); - else + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { resp.lid = (u16)attr.lid; + resp.sm_lid = (u16)attr.sm_lid; + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4fa94e6..6205359 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,7 +549,7 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 sm_lid; + u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; -- cgit v1.1 From 7db20ecd1d9700e2c240dee505162eb56ab55b5b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:37:49 -0400 Subject: IB/core: Change wc.slid from 16 to 32 bits slid field in struct ib_wc is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 10 +++++++--- drivers/infiniband/hw/hfi1/mad.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 6 +++--- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 2 +- include/rdma/ib_verbs.h | 14 +++++++++++++- 10 files changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613..b39ee16 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1703,7 +1703,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_slid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1713,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_slid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c..ff3c67a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_slid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 01e2ff0..eb0da37 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1190,7 +1190,8 @@ out: return ret ? ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1204,7 +1205,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_slid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1248,7 +1252,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673..00ebc26 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3958,7 +3958,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = in_wc->slid; + u16 slid = ib_slid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 00f0570..04fb44e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_slid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 95db929..cd2264a 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 9d83a53..e19ae0b 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_slid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 617531f..a9caada 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ae2ff8..0335a3d 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].slid = ib_slid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6205359..7eaf7d2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -948,7 +948,7 @@ struct ib_wc { u32 src_qp; int wc_flags; u16 pkey_index; - u16 slid; + u32 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ @@ -3706,4 +3706,16 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/* Return slid in 16bit CPU encoding */ +static inline u16 ib_slid_cpu16(u32 slid) +{ + return (u16)slid; +} + +/* Return slid in 16bit BE encoding */ +static inline u16 ib_slid_be16(u32 slid) +{ + return cpu_to_be16((u16)slid); +} #endif /* IB_VERBS_H */ -- cgit v1.1 From e92aa00a518971fca6b79aa87a1a9c5e5aa51f3b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:38:02 -0400 Subject: IB/CM: Add OPA Path record support to CM Add OPA path record support to the Connection Manager. Signed-off-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 50 +++++++++++++++++++++++++++++++++++++------- include/rdma/opa_addr.h | 18 ++++++++++++++++ 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b39ee16..885c429 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1175,6 +1175,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1207,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1236,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -2843,6 +2866,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2884,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 46d0567..9b5e642 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -77,4 +77,22 @@ static inline u32 opa_get_lid_from_gid(union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } + +/** + * opa_is_extended_lid: Returns true if dlid or slid are + * extended. + * + * @dlid: The DLID + * @slid: The SLID + */ +static inline bool opa_is_extended_lid(u32 dlid, u32 slid) +{ + if ((be32_to_cpu(dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) + return true; + else + return false; +} #endif /* OPA_ADDR_H */ -- cgit v1.1 From 6b3c0e6e6d5abfefb0112cd450e0aee97fcab7a8 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:38:03 -0400 Subject: IB/CM: Create appropriate path records when handling CM request When handling an incoming conection request, ib_cm creates either an IB or an OPA path record based on the gid field in the request. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 885c429..4d870a0 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1428,6 +1428,21 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); } +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((req_msg->alt_local_lid) || + (ib_is_opa_gid(&req_msg->alt_local_gid))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path) @@ -1807,9 +1822,12 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); @@ -1834,16 +1852,19 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { @@ -2962,8 +2983,6 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { - memset(path, 0, sizeof *path); - path->rec_type = SA_PATH_REC_TYPE_IB; path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); @@ -2999,6 +3018,11 @@ static int cm_lap_handler(struct cm_work *work) return -EINVAL; param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &lap_msg->alt_local_gid); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; -- cgit v1.1 From ac3a949fb2fff36bebdc4fab90567ed349ea7245 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:38:04 -0400 Subject: IB/CM: Set appropriate slid and dlid when handling CM request If extended LIDs are being used, a connection request contains OPA GIDs in them. Extract the lids from the OPA gids and populate slid/dlid fields in the path records that are created when handling a connection request. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 67 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4d870a0..d5ca101 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1443,16 +1443,48 @@ static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, path->rec_type = SA_PATH_REC_TYPE_IB; } +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, + htonl(ntohs(req_msg->primary_local_lid))); + sa_path_set_slid(primary_path, + htonl(ntohs(req_msg->primary_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + sa_path_set_dlid(primary_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + sa_path_set_slid(primary_path, cpu_to_be32(lid)); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, + htonl(ntohs(req_msg->alt_local_lid))); + sa_path_set_slid(alt_path, + htonl(ntohs(req_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + sa_path_set_dlid(alt_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + sa_path_set_slid(alt_path, cpu_to_be32(lid)); + } +} + static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path) { primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; - sa_path_set_dlid(primary_path, - htonl(ntohs(req_msg->primary_local_lid))); - sa_path_set_slid(primary_path, - htonl(ntohs(req_msg->primary_remote_lid))); primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; @@ -1469,13 +1501,9 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; - sa_path_set_dlid(alt_path, - htonl(ntohs(req_msg->alt_local_lid))); - sa_path_set_slid(alt_path, - htonl(ntohs(req_msg->alt_remote_lid))); alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; @@ -1492,6 +1520,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path); } static u16 cm_get_bth_pkey(struct cm_work *work) @@ -2979,14 +3008,29 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_lap); +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); + sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + sa_path_set_dlid(path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + sa_path_set_slid(path, cpu_to_be32(lid)); + } +} + static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; - sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); - sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); @@ -3000,6 +3044,7 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); } static int cm_lap_handler(struct cm_work *work) -- cgit v1.1 From 78249c4215840edb95447ec6867b69a7ac1d7a0d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:38 +0300 Subject: mlx5: convert to generic pci_alloc_irq_vectors Now that we have a generic code to allocate an array of irq vectors and even correctly spread their affinity, correctly handle cpu hotplug events and more, were much better off using it. Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 9 ++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 39 +++++++++------------- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 - include/linux/mlx5/driver.h | 1 - 7 files changed, 20 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1eac500..d0e572d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -397,7 +397,7 @@ static void mlx5e_enable_async_events(struct mlx5e_priv *priv) static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); - synchronize_irq(mlx5_get_msix_vec(priv->mdev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC)); } static inline int mlx5e_get_wqe_mtt_sz(void) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d..8a09d71 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -585,7 +585,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, name, pci_name(dev->pdev)); eq->eqn = MLX5_GET(create_eq_out, out, eq_number); - eq->irqn = priv->msix_arr[vecidx].vector; + eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; err = request_irq(eq->irqn, handler, 0, @@ -620,7 +620,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, return 0; err_irq: - free_irq(priv->msix_arr[vecidx].vector, eq); + free_irq(eq->irqn, eq); err_eq: mlx5_cmd_destroy_eq(dev, eq->eqn); @@ -661,11 +661,6 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx) -{ - return dev->priv.msix_arr[MLX5_EQ_VEC_ASYNC].vector; -} - int mlx5_eq_init(struct mlx5_core_dev *dev) { int err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 89bfda4..1ce2543 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1585,7 +1585,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) /* Mark this vport as disabled to discard new events */ vport->enabled = false; - synchronize_irq(mlx5_get_msix_vec(esw->dev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC)); /* Wait for current already scheduled events to complete */ flush_workqueue(esw->work_queue); /* Disable events from this vport */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 4b6b03d..8aea0a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -81,7 +81,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) u64 vector; /* wait for pending handlers to complete */ - synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); + synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); spin_lock_irqsave(&dev->cmd.alloc_lock, flags); vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); if (!vector) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132..d2fd55e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -312,13 +312,12 @@ static void release_bar(struct pci_dev *pdev) pci_release_regions(pdev); } -static int mlx5_enable_msix(struct mlx5_core_dev *dev) +static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; - int i; nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; @@ -326,17 +325,13 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; - priv->msix_arr = kcalloc(nvec, sizeof(*priv->msix_arr), GFP_KERNEL); - priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); - if (!priv->msix_arr || !priv->irq_info) + if (!priv->irq_info) goto err_free_msix; - for (i = 0; i < nvec; i++) - priv->msix_arr[i].entry = i; - - nvec = pci_enable_msix_range(dev->pdev, priv->msix_arr, - MLX5_EQ_VEC_COMP_BASE + 1, nvec); + nvec = pci_alloc_irq_vectors(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + 1, nvec, + PCI_IRQ_MSIX); if (nvec < 0) return nvec; @@ -346,17 +341,15 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) err_free_msix: kfree(priv->irq_info); - kfree(priv->msix_arr); return -ENOMEM; } -static void mlx5_disable_msix(struct mlx5_core_dev *dev) +static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; - pci_disable_msix(dev->pdev); + pci_free_irq_vectors(dev->pdev); kfree(priv->irq_info); - kfree(priv->msix_arr); } struct mlx5_reg_host_endianness { @@ -615,8 +608,7 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); @@ -636,8 +628,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); irq_set_affinity_hint(irq, NULL); free_cpumask_var(priv->irq_info[i].mask); @@ -760,8 +751,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev) } #ifdef CONFIG_RFS_ACCEL - irq_cpu_rmap_add(dev->rmap, - dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector); + irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + i)); #endif snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(dev, eq, @@ -1119,9 +1110,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_poll; } - err = mlx5_enable_msix(dev); + err = mlx5_alloc_irq_vectors(dev); if (err) { - dev_err(&pdev->dev, "enable msix failed\n"); + dev_err(&pdev->dev, "alloc irq vectors failed\n"); goto err_cleanup_once; } @@ -1220,7 +1211,7 @@ err_put_uars: mlx5_put_uars_page(dev, priv->uar); err_disable_msix: - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); err_cleanup_once: if (boot) @@ -1287,7 +1278,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); if (cleanup) mlx5_cleanup_once(dev); mlx5_stop_health_poll(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a3d6be..ba1d494 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -110,7 +110,6 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, u32 element_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); void mlx5_cq_tasklet_cb(unsigned long data); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59..5bac7f5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -597,7 +597,6 @@ struct mlx5_port_module_event_stats { struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; - struct msix_entry *msix_arr; struct mlx5_irq_info *irq_info; /* pages stuff */ -- cgit v1.1 From a85e5474f4c783b252bf6b80571cdb2abb7d69d9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:39 +0300 Subject: mlx5e: don't assume anything on the irq affinity mappings of the device mlx5e currently assumes that irq affinity is really spread first irq vectors across device home node cpus, with the new generic affinity mappings this is no longer the case, hence mlxe should not rely on this anymore. Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d0e572d..2c4e4183 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3793,18 +3793,8 @@ void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev, u32 *indirection_rqt, int len, int num_channels) { - int node = mdev->priv.numa_node; - int node_num_of_cores; int i; - if (node == -1) - node = first_online_node; - - node_num_of_cores = cpumask_weight(cpumask_of_node(node)); - - if (node_num_of_cores) - num_channels = min_t(int, num_channels, node_num_of_cores); - for (i = 0; i < len; i++) indirection_rqt[i] = i % num_channels; } -- cgit v1.1 From a435393acafbf0ecff4deb3e3cb554b34f0d0664 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:40 +0300 Subject: mlx5: move affinity hints assignments to generic code generic api takes care of spreading affinity similar to what mlx5 open coded (and even handles better asymmetric configurations). Ask the generic API to spread affinity for us, and feed him pre_vectors that do not participate in affinity settings (which is an improvement to what we had before). The affinity assignments should match what mlx5 tried to do earlier but now we do not set affinity to async, cmd and pages dedicated vectors. Also, remove mlx5e_get_cpu and introduce mlx5e_get_node (used for allocation purposes) and mlx5_get_vector_affinity (for indirection table construction) as they provide the needed information. Luckily, we have generic helpers to get cpumask and node given a irq vector. mlx5_get_vector_affinity will be used by mlx5_ib in a subsequent patch. Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 75 ++--------------------- include/linux/mlx5/driver.h | 7 ++- 4 files changed, 35 insertions(+), 93 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b7ddf..9091232 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -587,7 +587,6 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct mlx5e_tstamp *tstamp; int ix; - int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2c4e4183..fb64756 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,6 +71,11 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; +static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) +{ + return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); +} + static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -444,16 +449,17 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; + int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, node); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, - cpu_to_node(c->cpu)); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, + GFP_KERNEL, node); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -561,7 +567,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = cpu_to_node(c->cpu); + rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -628,7 +634,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, + mlx5e_get_node(c->priv, c->ix)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -993,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1047,13 +1054,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1119,13 +1126,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1497,8 +1504,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = cpu_to_node(c->cpu); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1597,11 +1604,6 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) -{ - return cpumask_first(priv->mdev->priv.irq_info[ix].mask); -} - static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1750,11 +1752,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; - int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; int err; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); if (!c) return -ENOMEM; @@ -1762,7 +1763,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; - c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1848,7 +1848,8 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); + netif_set_xps_queue(c->netdev, + mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d2fd55e..e464e81 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -316,6 +316,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; + struct irq_affinity irqdesc = { + .pre_vectors = MLX5_EQ_VEC_COMP_BASE, + }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; @@ -329,9 +332,10 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) if (!priv->irq_info) goto err_free_msix; - nvec = pci_alloc_irq_vectors(dev->pdev, + nvec = pci_alloc_irq_vectors_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1, nvec, - PCI_IRQ_MSIX); + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, + &irqdesc); if (nvec < 0) return nvec; @@ -605,63 +609,6 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } -static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { - mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); - return -ENOMEM; - } - - cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), - priv->irq_info[i].mask); - - if (IS_ENABLED(CONFIG_SMP) && - irq_set_affinity_hint(irq, priv->irq_info[i].mask)) - mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); - - return 0; -} - -static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - irq_set_affinity_hint(irq, NULL); - free_cpumask_var(priv->irq_info[i].mask); -} - -static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) -{ - int err; - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { - err = mlx5_irq_set_affinity_hint(mdev, i); - if (err) - goto err_out; - } - - return 0; - -err_out: - for (i--; i >= 0; i--) - mlx5_irq_clear_affinity_hint(mdev, i); - - return err; -} - -static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) -{ - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) - mlx5_irq_clear_affinity_hint(mdev, i); -} - int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -1134,12 +1081,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } - err = mlx5_irq_set_affinity_hints(dev); - if (err) { - dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); - goto err_affinity_hints; - } - err = mlx5_init_fs(dev); if (err) { dev_err(&pdev->dev, "Failed to init flow steering\n"); @@ -1199,9 +1140,6 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: - mlx5_irq_clear_affinity_hints(dev); - -err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1274,7 +1212,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_eswitch_detach(dev->priv.eswitch); #endif mlx5_cleanup_fs(dev); - mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5bac7f5..5797318 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -534,7 +534,6 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { - cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; @@ -1184,4 +1183,10 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline const struct cpumask * +mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +{ + return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.1 From c66cd353bbe6869a059869a7a1518ec619afdc9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:41 +0300 Subject: RDMA/core: expose affinity mappings per completion vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow ULPs to intelligently locate threads based on completion vector cpu affinity mappings. In case the driver does not expose a get_vector_affinity callout, return NULL so the caller can maintain a fallback logic. Reviewed-by: Christoph Hellwig Reviewed-by: HÃ¥kon Bugge Acked-by: Doug Ledford Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b573243..73ed2e4e8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2296,6 +2296,8 @@ struct ib_device { */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, + int comp_vector); }; struct ib_client { @@ -3706,4 +3708,26 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/** + * ib_get_vector_affinity - Get the affinity mappings of a given completion + * vector + * @device: the rdma device + * @comp_vector: index of completion vector + * + * Returns NULL on failure, otherwise a corresponding cpu map of the + * completion vector (returns all-cpus map if the device driver doesn't + * implement get_vector_affinity). + */ +static inline const struct cpumask * +ib_get_vector_affinity(struct ib_device *device, int comp_vector) +{ + if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || + !device->get_vector_affinity) + return NULL; + + return device->get_vector_affinity(device, comp_vector); + +} + #endif /* IB_VERBS_H */ -- cgit v1.1 From 40b24403f33ed8e9401b087e25f46d002fc8f396 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:42 +0300 Subject: mlx5: support ->get_vector_affinity Simply refer to the generic affinity mask helper. Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a7f2e60..a6d3bcf 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3556,6 +3556,14 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, return netdev; } +const struct cpumask *mlx5_ib_get_vector_affinity(struct ib_device *ibdev, + int comp_vector) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_get_vector_affinity(dev->mdev, comp_vector); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -3686,6 +3694,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev; -- cgit v1.1 From 24c5dc6610e8a3764fcb885cc3284c12ff1513de Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:43 +0300 Subject: block: Add rdma affinity based queue mapping helper Like pci and virtio, we add a rdma helper for affinity spreading. This achieves optimal mq affinity assignments according to the underlying rdma device affinity maps. Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- block/Kconfig | 5 +++++ block/Makefile | 1 + block/blk-mq-rdma.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq-rdma.h | 10 +++++++++ 4 files changed, 68 insertions(+) create mode 100644 block/blk-mq-rdma.c create mode 100644 include/linux/blk-mq-rdma.h diff --git a/block/Kconfig b/block/Kconfig index 89cd28f..3ab42bb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -206,4 +206,9 @@ config BLK_MQ_VIRTIO depends on BLOCK && VIRTIO default y +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2b281cf2..9396ebc 100644 --- a/block/Makefile +++ b/block/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 0000000..996167f --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h new file mode 100644 index 0000000..b4ade19 --- /dev/null +++ b/include/linux/blk-mq-rdma.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +struct blk_mq_tag_set; +struct ib_device; + +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); + +#endif /* _LINUX_BLK_MQ_RDMA_H */ -- cgit v1.1 From 0b36658ca8b15b55eca2806ce7d5542ec26ee686 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:44 +0300 Subject: nvme-rdma: use intelligent affinity based queue mappings Use the generic block layer affinity mapping helper. Also, limit nr_hw_queues to the rdma device number of irq vectors as we don't really need more. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/nvme/host/rdma.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index da04df1..4e25acc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -463,14 +464,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) ibdev = queue->device->dev; /* - * The admin queue is barely used once the controller is live, so don't - * bother to spread it out. + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. */ - if (idx == 0) - comp_vector = 0; - else - comp_vector = idx % ibdev->num_comp_vectors; - + comp_vector = idx == 0 ? idx : idx - 1; /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, @@ -611,10 +608,20 @@ out_free_queues: static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct ib_device *ibdev = ctrl->device->dev; unsigned int nr_io_queues; int i, ret; nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + + /* + * we map queues according to the device irq vectors for + * optimal locality so we don't need more queues than + * completion vectors. + */ + nr_io_queues = min_t(unsigned int, nr_io_queues, + ibdev->num_comp_vectors); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -1498,6 +1505,13 @@ static void nvme_rdma_complete_rq(struct request *rq) nvme_complete_rq(rq); } +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + + return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); +} + static const struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, @@ -1507,6 +1521,7 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_hctx = nvme_rdma_init_hctx, .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, + .map_queues = nvme_rdma_map_queues, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { -- cgit v1.1