From 9ba6d5529dd919b442eedf5bef1dd28aca2ee9fe Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 12 Apr 2007 18:10:25 +0300 Subject: IB/mthca: Work around kernel QP starvation With mthca, RC QPs can starve each other and even UD QPs on the same hardware schedule queue. As a result, userspace MPI can starve e.g. IPoIB traffic, with netdev watchdog warnings getting printed out, and TCP connections getting stuck or failing. Reduce the chance of this happening by using three separate hardware schedule queues: one for userspace RC QPs, one for kernel RC QPs, and one for all other QPs. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 8fe6fee..fee60c8 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -701,6 +701,19 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); } + if (ibqp->qp_type == IB_QPT_RC && + cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + u8 sched_queue = ibqp->uobject ? 0x2 : 0x1; + + if (mthca_is_memfree(dev)) + qp_context->rlkey_arbel_sched_queue |= sched_queue; + else + qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue); + + qp_param->opt_param_mask |= + cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE); + } + if (attr_mask & IB_QP_TIMEOUT) { qp_context->pri_path.ackto = attr->timeout << 3; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); -- cgit v1.1 From f4fd0b224d60044d2da5ca02f8f2b5150c1d8731 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 3 May 2007 13:48:47 +0300 Subject: IB: Add CQ comp_vector support Add a num_comp_vectors member to struct ib_device and extend ib_create_cq() to pass in a comp_vector parameter -- this parallels the userspace libibverbs API. Update all hardware drivers to set num_comp_vectors to 1 and have all ULPs pass 0 for the comp_vector value. Pass the value of num_comp_vectors to userspace rather than hard-coding a value of 1. We want multiple CQ event vector support (via MSI-X or similar for adapters that can generate multiple interrupts), but it's not clear how many vectors we want, or how we want to deal with policy issues such as how to decide which vector to use or how to set up interrupt affinity. This patch is useful for experimenting, since no core changes will be necessary when updating a driver to support multiple vectors, and we know that we want to make at least these changes anyway. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_provider.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 47e6fd46d..1c05486 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -663,6 +663,7 @@ static int mthca_destroy_qp(struct ib_qp *qp) } static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, struct ib_ucontext *context, struct ib_udata *udata) { @@ -1292,6 +1293,7 @@ int mthca_register_device(struct mthca_dev *dev) (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.phys_port_cnt = dev->limits.num_ports; + dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dma_device = &dev->pdev->dev; dev->ib_dev.query_device = mthca_query_device; dev->ib_dev.query_port = mthca_query_port; -- cgit v1.1 From ed23a72778f3dbd465e55b06fe31629e7e1dd2f3 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 6 May 2007 21:02:48 -0700 Subject: IB: Return "maybe missed event" hint from ib_req_notify_cq() The semantics defined by the InfiniBand specification say that completion events are only generated when a completions is added to a completion queue (CQ) after completion notification is requested. In other words, this means that the following race is possible: while (CQ is not empty) ib_poll_cq(CQ); // new completion is added after while loop is exited ib_req_notify_cq(CQ); // no event is generated for the existing completion To close this race, the IB spec recommends doing another poll of the CQ after requesting notification. However, it is not always possible to arrange code this way (for example, we have found that NAPI for IPoIB cannot poll after requesting notification). Also, some hardware (eg Mellanox HCAs) actually will generate an event for completions added before the call to ib_req_notify_cq() -- which is allowed by the spec, since there's no way for any upper-layer consumer to know exactly when a completion was really added -- so the extra poll of the CQ is just a waste. Motivated by this, we add a new flag "IB_CQ_REPORT_MISSED_EVENTS" for ib_req_notify_cq() so that it can return a hint about whether the a completion may have been added before the request for notification. The return value of ib_req_notify_cq() is extended so: < 0 means an error occurred while requesting notification == 0 means notification was requested successfully, and if IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events were missed and it is safe to wait for another event. > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed in. It means that the consumer must poll the CQ again to make sure it is empty to avoid the race described above. We add a flag to enable this behavior rather than turning it on unconditionally, because checking for missed events may incur significant overhead for some low-level drivers, and consumers that don't care about the results of this test shouldn't be forced to pay for the test. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 12 +++++++----- drivers/infiniband/hw/mthca/mthca_dev.h | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index efd79ef..cf0868f 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -726,11 +726,12 @@ repoll: return err == 0 || err == -EAGAIN ? npolled : err; } -int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify) +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { __be32 doorbell[2]; - doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ? + doorbell[0] = cpu_to_be32(((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : MTHCA_TAVOR_CQ_DB_REQ_NOT) | to_mcq(cq)->cqn); @@ -743,7 +744,7 @@ int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify) return 0; } -int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) +int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mthca_cq *cq = to_mcq(ibcq); __be32 doorbell[2]; @@ -755,7 +756,8 @@ int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) doorbell[0] = ci; doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | - (notify == IB_CQ_SOLICITED ? 1 : 2)); + ((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 1 : 2)); mthca_write_db_rec(doorbell, cq->arm_db); @@ -766,7 +768,7 @@ int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) wmb(); doorbell[0] = cpu_to_be32((sn << 28) | - (notify == IB_CQ_SOLICITED ? + ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index b7e42ef..9bae3cc 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -495,8 +495,8 @@ void mthca_unmap_eq_icm(struct mthca_dev *dev); int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); -int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify); -int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify); +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); int mthca_init_cq(struct mthca_dev *dev, int nent, struct mthca_ucontext *ctx, u32 pdn, struct mthca_cq *cq); -- cgit v1.1