summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/configfs-rdma_cm22
-rw-r--r--Documentation/ABI/testing/sysfs-class-infiniband16
-rw-r--r--Documentation/infiniband/core_locking.txt2
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt2
-rw-r--r--MAINTAINERS32
-rw-r--r--block/Makefile2
-rw-r--r--drivers/infiniband/Kconfig10
-rw-r--r--drivers/infiniband/core/Makefile4
-rw-r--r--drivers/infiniband/core/addr.c194
-rw-r--r--drivers/infiniband/core/cache.c345
-rw-r--r--drivers/infiniband/core/cm.c49
-rw-r--r--drivers/infiniband/core/cma.c265
-rw-r--r--drivers/infiniband/core/cma_configfs.c321
-rw-r--r--drivers/infiniband/core/core_priv.h45
-rw-r--r--drivers/infiniband/core/cq.c209
-rw-r--r--drivers/infiniband/core/device.c51
-rw-r--r--drivers/infiniband/core/fmr_pool.c20
-rw-r--r--drivers/infiniband/core/mad.c162
-rw-r--r--drivers/infiniband/core/mad_priv.h2
-rw-r--r--drivers/infiniband/core/multicast.c17
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c81
-rw-r--r--drivers/infiniband/core/sa_query.c91
-rw-r--r--drivers/infiniband/core/sysfs.c377
-rw-r--r--drivers/infiniband/core/ud_header.c155
-rw-r--r--drivers/infiniband/core/umem_odp.c2
-rw-r--r--drivers/infiniband/core/user_mad.c1
-rw-r--r--drivers/infiniband/core/uverbs.h2
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c38
-rw-r--r--drivers/infiniband/core/uverbs_main.c13
-rw-r--r--drivers/infiniband/core/uverbs_marshall.c1
-rw-r--r--drivers/infiniband/core/verbs.c238
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_cm.c4
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_cq.c4
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_mem.c102
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.c146
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.h15
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_qp.c82
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c14
-rw-r--r--drivers/infiniband/hw/cxgb4/cq.c3
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c57
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h13
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c251
-rw-r--r--drivers/infiniband/hw/cxgb4/provider.c3
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c5
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h7
-rw-r--r--drivers/infiniband/hw/cxgb4/user.h2
-rw-r--r--drivers/infiniband/hw/mlx4/ah.c3
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c3
-rw-r--r--drivers/infiniband/hw/mlx4/main.c102
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h10
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c22
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c318
-rw-r--r--drivers/infiniband/hw/mlx4/srq.c3
-rw-r--r--drivers/infiniband/hw/mlx5/ah.c32
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c31
-rw-r--r--drivers/infiniband/hw/mlx5/main.c447
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h121
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c29
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c1014
-rw-r--r--drivers/infiniband/hw/mlx5/srq.c41
-rw-r--r--drivers/infiniband/hw/mlx5/user.h63
-rw-r--r--drivers/infiniband/hw/mthca/mthca_cq.c3
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c84
-rw-r--r--drivers/infiniband/hw/mthca/mthca_qp.c2
-rw-r--r--drivers/infiniband/hw/nes/nes_cm.c17
-rw-r--r--drivers/infiniband/hw/nes/nes_cm.h2
-rw-r--r--drivers/infiniband/hw/nes/nes_utils.c2
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c216
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.h4
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_ah.c7
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_main.c1
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c163
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.h3
-rw-r--r--drivers/infiniband/hw/qib/qib_mr.c51
-rw-r--r--drivers/infiniband/hw/qib/qib_qp.c46
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.c12
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.h4
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs_mcast.c35
-rw-r--r--drivers/infiniband/hw/usnic/usnic_debugfs.c5
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c4
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.c24
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.h2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_vnic.c54
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h6
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c21
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ethtool.c14
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c40
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c45
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.c13
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.h156
-rw-r--r--drivers/infiniband/ulp/iser/iser_initiator.c323
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c179
-rw-r--r--drivers/infiniband/ulp/iser/iser_verbs.c337
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.c118
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.h41
-rw-r--r--drivers/infiniband/ulp/isert/isert_proto.h47
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c205
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.h7
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.c443
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.h53
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/fw.c39
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/port.c7
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/qp.c26
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eq.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c61
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/qp.c233
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/srq.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/transobj.c50
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/vport.c116
-rw-r--r--drivers/scsi/Kconfig1
-rw-r--r--drivers/scsi/be2iscsi/Kconfig1
-rw-r--r--drivers/scsi/be2iscsi/be.h4
-rw-r--r--drivers/scsi/be2iscsi/be_iscsi.c4
-rw-r--r--drivers/scsi/be2iscsi/be_main.c20
-rw-r--r--drivers/scsi/ipr.c25
-rw-r--r--drivers/scsi/ipr.h4
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c21
-rw-r--r--drivers/staging/rdma/amso1100/c2_cq.c3
-rw-r--r--drivers/staging/rdma/amso1100/c2_provider.c72
-rw-r--r--drivers/staging/rdma/ehca/ehca_classes.h5
-rw-r--r--drivers/staging/rdma/ehca/ehca_iverbs.h16
-rw-r--r--drivers/staging/rdma/ehca/ehca_main.c4
-rw-r--r--drivers/staging/rdma/ehca/ehca_mrmw.c477
-rw-r--r--drivers/staging/rdma/ehca/ehca_mrmw.h5
-rw-r--r--drivers/staging/rdma/ehca/ehca_reqs.c1
-rw-r--r--drivers/staging/rdma/hfi1/mr.c51
-rw-r--r--drivers/staging/rdma/hfi1/verbs.c1
-rw-r--r--drivers/staging/rdma/hfi1/verbs.h4
-rw-r--r--drivers/staging/rdma/ipath/ipath_mr.c55
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.c1
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.h4
-rw-r--r--include/linux/blk-iopoll.h46
-rw-r--r--include/linux/interrupt.h2
-rw-r--r--include/linux/irq_poll.h25
-rw-r--r--include/linux/mlx4/cmd.h3
-rw-r--r--include/linux/mlx4/device.h15
-rw-r--r--include/linux/mlx4/qp.h15
-rw-r--r--include/linux/mlx5/device.h40
-rw-r--r--include/linux/mlx5/driver.h20
-rw-r--r--include/linux/mlx5/mlx5_ifc.h48
-rw-r--r--include/linux/mlx5/qp.h46
-rw-r--r--include/linux/mlx5/transobj.h (renamed from drivers/net/ethernet/mellanox/mlx5/core/transobj.h)10
-rw-r--r--include/linux/mlx5/vport.h8
-rw-r--r--include/linux/sunrpc/svc_rdma.h39
-rw-r--r--include/rdma/ib_addr.h16
-rw-r--r--include/rdma/ib_cache.h4
-rw-r--r--include/rdma/ib_mad.h2
-rw-r--r--include/rdma/ib_pack.h45
-rw-r--r--include/rdma/ib_pma.h1
-rw-r--r--include/rdma/ib_sa.h3
-rw-r--r--include/rdma/ib_verbs.h356
-rw-r--r--include/scsi/iser.h78
-rw-r--r--include/trace/events/irq.h2
-rw-r--r--lib/Kconfig5
-rw-r--r--lib/Makefile1
-rw-r--r--lib/irq_poll.c (renamed from block/blk-iopoll.c)108
-rw-r--r--net/rds/ib.c34
-rw-r--r--net/rds/iw.c23
-rw-r--r--net/sunrpc/xprt.c1
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c41
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c371
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c56
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c33
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c360
-rw-r--r--net/sunrpc/xprtrdma/transport.c30
-rw-r--r--net/sunrpc/xprtrdma/verbs.c24
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h21
-rw-r--r--tools/lib/traceevent/event-parse.c2
-rw-r--r--tools/perf/util/trace-event-parse.c2
174 files changed, 7110 insertions, 4669 deletions
diff --git a/Documentation/ABI/testing/configfs-rdma_cm b/Documentation/ABI/testing/configfs-rdma_cm
new file mode 100644
index 0000000..5c389aa
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-rdma_cm
@@ -0,0 +1,22 @@
+What: /config/rdma_cm
+Date: November 29, 2015
+KernelVersion: 4.4.0
+Description: Interface is used to configure RDMA-cable HCAs in respect to
+ RDMA-CM attributes.
+
+ Attributes are visible only when configfs is mounted. To mount
+ configfs in /config directory use:
+ # mount -t configfs none /config/
+
+ In order to set parameters related to a specific HCA, a directory
+ for this HCA has to be created:
+ mkdir -p /config/rdma_cm/<hca>
+
+
+What: /config/rdma_cm/<hca>/ports/<port-num>/default_roce_mode
+Date: November 29, 2015
+KernelVersion: 4.4.0
+Description: RDMA-CM based connections from HCA <hca> at port <port-num>
+ will be initiated with this RoCE type as default.
+ The possible RoCE types are either "IB/RoCE v1" or "RoCE v2".
+ This parameter has RW access.
diff --git a/Documentation/ABI/testing/sysfs-class-infiniband b/Documentation/ABI/testing/sysfs-class-infiniband
new file mode 100644
index 0000000..a86abe6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-infiniband
@@ -0,0 +1,16 @@
+What: /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/ndevs/<gid-index>
+Date: November 29, 2015
+KernelVersion: 4.4.0
+Contact: linux-rdma@vger.kernel.org
+Description: The net-device's name associated with the GID resides
+ at index <gid-index>.
+
+What: /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/types/<gid-index>
+Date: November 29, 2015
+KernelVersion: 4.4.0
+Contact: linux-rdma@vger.kernel.org
+Description: The RoCE type of the associated GID resides at index <gid-index>.
+ This could either be "IB/RoCE v1" for IB and RoCE v1 based GODs
+ or "RoCE v2" for RoCE v2 based GIDs.
+
+
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
index e167854..4b1f36b 100644
--- a/Documentation/infiniband/core_locking.txt
+++ b/Documentation/infiniband/core_locking.txt
@@ -15,7 +15,6 @@ Sleeping and interrupt context
modify_ah
query_ah
destroy_ah
- bind_mw
post_send
post_recv
poll_cq
@@ -31,7 +30,6 @@ Sleeping and interrupt context
ib_modify_ah
ib_query_ah
ib_destroy_ah
- ib_bind_mw
ib_post_send
ib_post_recv
ib_req_notify_cq
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f4cbfe0..edec3a3 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ: Do all of the following:
from being initiated from tasks that might run on the CPU to
be de-jittered. (It is OK to force this CPU offline and then
bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
+IRQ_POLL_SOFTIRQ: Do all of the following:
1. Force block-device interrupts onto some other CPU.
2. Initiate any block I/O and block-I/O polling on other CPUs.
3. Once your application has started, prevent CPU-hotplug operations
diff --git a/MAINTAINERS b/MAINTAINERS
index 8f3f93c..59373e5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7151,27 +7151,45 @@ W: https://linuxtv.org
S: Odd Fixes
F: drivers/media/radio/radio-miropcm20*
-Mellanox MLX5 core VPI driver
-M: Eli Cohen <eli@mellanox.com>
+MELLANOX MLX4 core VPI driver
+M: Yishai Hadas <yishaih@mellanox.com>
L: netdev@vger.kernel.org
L: linux-rdma@vger.kernel.org
W: http://www.mellanox.com
Q: http://patchwork.ozlabs.org/project/netdev/list/
+S: Supported
+F: drivers/net/ethernet/mellanox/mlx4/
+F: include/linux/mlx4/
+
+MELLANOX MLX4 IB driver
+M: Yishai Hadas <yishaih@mellanox.com>
+L: linux-rdma@vger.kernel.org
+W: http://www.mellanox.com
Q: http://patchwork.kernel.org/project/linux-rdma/list/
-T: git git://openfabrics.org/~eli/connect-ib.git
+S: Supported
+F: drivers/infiniband/hw/mlx4/
+F: include/linux/mlx4/
+
+MELLANOX MLX5 core VPI driver
+M: Matan Barak <matanb@mellanox.com>
+M: Leon Romanovsky <leonro@mellanox.com>
+L: netdev@vger.kernel.org
+L: linux-rdma@vger.kernel.org
+W: http://www.mellanox.com
+Q: http://patchwork.ozlabs.org/project/netdev/list/
S: Supported
F: drivers/net/ethernet/mellanox/mlx5/core/
F: include/linux/mlx5/
-Mellanox MLX5 IB driver
-M: Eli Cohen <eli@mellanox.com>
+MELLANOX MLX5 IB driver
+M: Matan Barak <matanb@mellanox.com>
+M: Leon Romanovsky <leonro@mellanox.com>
L: linux-rdma@vger.kernel.org
W: http://www.mellanox.com
Q: http://patchwork.kernel.org/project/linux-rdma/list/
-T: git git://openfabrics.org/~eli/connect-ib.git
S: Supported
-F: include/linux/mlx5/
F: drivers/infiniband/hw/mlx5/
+F: include/linux/mlx5/
MELEXIS MLX90614 DRIVER
M: Crt Mori <cmo@melexis.com>
diff --git a/block/Makefile b/block/Makefile
index db5f622..9eda232 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index aa26f3c..8a8440c 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
depends on NET
depends on INET
depends on m || IPV6 != m
+ select IRQ_POLL
---help---
Core support for InfiniBand (IB). Make sure to also select
any protocols you wish to use as well as drivers for your
@@ -54,6 +55,15 @@ config INFINIBAND_ADDR_TRANS
depends on INFINIBAND
default y
+config INFINIBAND_ADDR_TRANS_CONFIGFS
+ bool
+ depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m)
+ default y
+ ---help---
+ ConfigFS support for RDMA communication manager (CM).
+ This allows the user to config the default GID type that the CM
+ uses for each device, when initiaing new connections.
+
source "drivers/infiniband/hw/mthca/Kconfig"
source "drivers/infiniband/hw/qib/Kconfig"
source "drivers/infiniband/hw/cxgb3/Kconfig"
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..f818538 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
$(user_access-y)
-ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
roce_gid_mgmt.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
@@ -24,6 +24,8 @@ iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
rdma_cm-y := cma.o
+rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
+
rdma_ucm-y := ucma.o
ib_addr-y := addr.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 34b1ada..337353d 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
}
EXPORT_SYMBOL(rdma_copy_addr);
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr,
u16 *vlan_id)
{
struct net_device *dev;
@@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
switch (addr->sa_family) {
case AF_INET:
dev = ip_dev_find(dev_addr->net,
- ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+ ((const struct sockaddr_in *)addr)->sin_addr.s_addr);
if (!dev)
return ret;
@@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
rcu_read_lock();
for_each_netdev_rcu(dev_addr->net, dev) {
if (ipv6_chk_addr(dev_addr->net,
- &((struct sockaddr_in6 *) addr)->sin6_addr,
+ &((const struct sockaddr_in6 *)addr)->sin6_addr,
dev, 1)) {
ret = rdma_copy_addr(dev_addr, dev, NULL);
if (vlan_id)
@@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req)
mutex_unlock(&lock);
}
-static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr)
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+ const void *daddr)
{
struct neighbour *n;
int ret;
@@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v
}
static int addr4_resolve(struct sockaddr_in *src_in,
- struct sockaddr_in *dst_in,
- struct rdma_dev_addr *addr)
+ const struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr,
+ struct rtable **prt)
{
__be32 src_ip = src_in->sin_addr.s_addr;
__be32 dst_ip = dst_in->sin_addr.s_addr;
@@ -243,33 +246,29 @@ static int addr4_resolve(struct sockaddr_in *src_in,
src_in->sin_family = AF_INET;
src_in->sin_addr.s_addr = fl4.saddr;
- if (rt->dst.dev->flags & IFF_LOOPBACK) {
- ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
- if (!ret)
- memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- goto put;
- }
+ /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+ * routable) and we could set the network type accordingly.
+ */
+ if (rt->rt_uses_gateway)
+ addr->network = RDMA_NETWORK_IPV4;
- /* If the device does ARP internally, return 'done' */
- if (rt->dst.dev->flags & IFF_NOARP) {
- ret = rdma_copy_addr(addr, rt->dst.dev, NULL);
- goto put;
- }
+ addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
- ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
-put:
- ip_rt_put(rt);
+ *prt = rt;
+ return 0;
out:
return ret;
}
#if IS_ENABLED(CONFIG_IPV6)
static int addr6_resolve(struct sockaddr_in6 *src_in,
- struct sockaddr_in6 *dst_in,
- struct rdma_dev_addr *addr)
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
{
struct flowi6 fl6;
struct dst_entry *dst;
+ struct rt6_info *rt;
int ret;
memset(&fl6, 0, sizeof fl6);
@@ -281,6 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
if ((ret = dst->error))
goto put;
+ rt = (struct rt6_info *)dst;
if (ipv6_addr_any(&fl6.saddr)) {
ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev,
&fl6.daddr, 0, &fl6.saddr);
@@ -291,43 +291,111 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
src_in->sin6_addr = fl6.saddr;
}
- if (dst->dev->flags & IFF_LOOPBACK) {
- ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
- if (!ret)
- memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- goto put;
- }
+ /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+ * routable) and we could set the network type accordingly.
+ */
+ if (rt->rt6i_flags & RTF_GATEWAY)
+ addr->network = RDMA_NETWORK_IPV6;
- /* If the device does ARP internally, return 'done' */
- if (dst->dev->flags & IFF_NOARP) {
- ret = rdma_copy_addr(addr, dst->dev, NULL);
- goto put;
- }
+ addr->hoplimit = ip6_dst_hoplimit(dst);
- ret = dst_fetch_ha(dst, addr, &fl6.daddr);
+ *pdst = dst;
+ return 0;
put:
dst_release(dst);
return ret;
}
#else
static int addr6_resolve(struct sockaddr_in6 *src_in,
- struct sockaddr_in6 *dst_in,
- struct rdma_dev_addr *addr)
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
{
return -EADDRNOTAVAIL;
}
#endif
+static int addr_resolve_neigh(struct dst_entry *dst,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ if (dst->dev->flags & IFF_LOOPBACK) {
+ int ret;
+
+ ret = rdma_translate_ip(dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+ MAX_ADDR_LEN);
+
+ return ret;
+ }
+
+ /* If the device doesn't do ARP internally */
+ if (!(dst->dev->flags & IFF_NOARP)) {
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+
+ return dst_fetch_ha(dst, addr,
+ dst_in->sa_family == AF_INET ?
+ (const void *)&dst_in4->sin_addr.s_addr :
+ (const void *)&dst_in6->sin6_addr);
+ }
+
+ return rdma_copy_addr(addr, dst->dev, NULL);
+}
+
static int addr_resolve(struct sockaddr *src_in,
- struct sockaddr *dst_in,
- struct rdma_dev_addr *addr)
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ bool resolve_neigh)
{
+ struct net_device *ndev;
+ struct dst_entry *dst;
+ int ret;
+
if (src_in->sa_family == AF_INET) {
- return addr4_resolve((struct sockaddr_in *) src_in,
- (struct sockaddr_in *) dst_in, addr);
- } else
- return addr6_resolve((struct sockaddr_in6 *) src_in,
- (struct sockaddr_in6 *) dst_in, addr);
+ struct rtable *rt = NULL;
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+
+ ret = addr4_resolve((struct sockaddr_in *)src_in,
+ dst_in4, addr, &rt);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(&rt->dst, dst_in, addr);
+
+ ndev = rt->dst.dev;
+ dev_hold(ndev);
+
+ ip_rt_put(rt);
+ } else {
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+
+ ret = addr6_resolve((struct sockaddr_in6 *)src_in,
+ dst_in6, addr,
+ &dst);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(dst, dst_in, addr);
+
+ ndev = dst->dev;
+ dev_hold(ndev);
+
+ dst_release(dst);
+ }
+
+ addr->bound_dev_if = ndev->ifindex;
+ addr->net = dev_net(ndev);
+ dev_put(ndev);
+
+ return ret;
}
static void process_req(struct work_struct *work)
@@ -343,7 +411,8 @@ static void process_req(struct work_struct *work)
if (req->status == -ENODATA) {
src_in = (struct sockaddr *) &req->src_addr;
dst_in = (struct sockaddr *) &req->dst_addr;
- req->status = addr_resolve(src_in, dst_in, req->addr);
+ req->status = addr_resolve(src_in, dst_in, req->addr,
+ true);
if (req->status && time_after_eq(jiffies, req->timeout))
req->status = -ETIMEDOUT;
else if (req->status == -ENODATA)
@@ -403,7 +472,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
req->client = client;
atomic_inc(&client->refcount);
- req->status = addr_resolve(src_in, dst_in, addr);
+ req->status = addr_resolve(src_in, dst_in, addr, true);
switch (req->status) {
case 0:
req->timeout = jiffies;
@@ -425,6 +494,26 @@ err:
}
EXPORT_SYMBOL(rdma_resolve_ip);
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr)
+{
+ struct sockaddr_storage ssrc_addr = {};
+ struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family)
+ return -EINVAL;
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ return addr_resolve(src_in, dst_addr, addr, false);
+}
+EXPORT_SYMBOL(rdma_resolve_ip_route);
+
void rdma_addr_cancel(struct rdma_dev_addr *addr)
{
struct addr_req *req, *temp_req;
@@ -456,8 +545,10 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
complete(&((struct resolve_cb_context *)context)->comp);
}
-int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid,
- u8 *dmac, u16 *vlan_id, int if_index)
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, u16 *vlan_id, int *if_index,
+ int *hoplimit)
{
int ret = 0;
struct rdma_dev_addr dev_addr;
@@ -475,7 +566,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi
rdma_gid2ip(&dgid_addr._sockaddr, dgid);
memset(&dev_addr, 0, sizeof(dev_addr));
- dev_addr.bound_dev_if = if_index;
+ if (if_index)
+ dev_addr.bound_dev_if = *if_index;
dev_addr.net = &init_net;
ctx.addr = &dev_addr;
@@ -491,12 +583,16 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi
dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
if (!dev)
return -ENODEV;
+ if (if_index)
+ *if_index = dev_addr.bound_dev_if;
if (vlan_id)
*vlan_id = rdma_vlan_dev_vlan_id(dev);
+ if (hoplimit)
+ *hoplimit = dev_addr.hoplimit;
dev_put(dev);
return ret;
}
-EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh);
int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
{
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 89bebea..53343ff 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -64,6 +64,7 @@ enum gid_attr_find_mask {
GID_ATTR_FIND_MASK_GID = 1UL << 0,
GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+ GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
};
enum gid_table_entry_props {
@@ -81,10 +82,6 @@ enum gid_table_write_action {
};
struct ib_gid_table_entry {
- /* This lock protects an entry from being
- * read and written simultaneously.
- */
- rwlock_t lock;
unsigned long props;
union ib_gid gid;
struct ib_gid_attr attr;
@@ -109,28 +106,86 @@ struct ib_gid_table {
* are locked by this lock.
**/
struct mutex lock;
+ /* This lock protects the table entries from being
+ * read and written simultaneously.
+ */
+ rwlock_t rwlock;
struct ib_gid_table_entry *data_vec;
};
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+{
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+}
+
+static const char * const gid_type_str[] = {
+ [IB_GID_TYPE_IB] = "IB/RoCE v1",
+ [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2",
+};
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
+{
+ if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+ return gid_type_str[gid_type];
+
+ return "Invalid GID type";
+}
+EXPORT_SYMBOL(ib_cache_gid_type_str);
+
+int ib_cache_gid_parse_type_str(const char *buf)
+{
+ unsigned int i;
+ size_t len;
+ int err = -EINVAL;
+
+ len = strlen(buf);
+ if (len == 0)
+ return -EINVAL;
+
+ if (buf[len - 1] == '\n')
+ len--;
+
+ for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+ if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) &&
+ len == strlen(gid_type_str[i])) {
+ err = i;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
+
+/* This function expects that rwlock will be write locked in all
+ * scenarios and that lock will be locked in sleep-able (RoCE)
+ * scenarios.
+ */
static int write_gid(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table, int ix,
const union ib_gid *gid,
const struct ib_gid_attr *attr,
enum gid_table_write_action action,
bool default_gid)
+ __releases(&table->rwlock) __acquires(&table->rwlock)
{
int ret = 0;
struct net_device *old_net_dev;
- unsigned long flags;
/* in rdma_cap_roce_gid_table, this funciton should be protected by a
* sleep-able lock.
*/
- write_lock_irqsave(&table->data_vec[ix].lock, flags);
if (rdma_cap_roce_gid_table(ib_dev, port)) {
table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
- write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+ write_unlock_irq(&table->rwlock);
/* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
* RoCE providers and thus only updates the cache.
*/
@@ -140,7 +195,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
else if (action == GID_TABLE_WRITE_ACTION_DEL)
ret = ib_dev->del_gid(ib_dev, port, ix,
&table->data_vec[ix].context);
- write_lock_irqsave(&table->data_vec[ix].lock, flags);
+ write_lock_irq(&table->rwlock);
}
old_net_dev = table->data_vec[ix].attr.ndev;
@@ -162,17 +217,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
- write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
-
- if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
- struct ib_event event;
-
- event.device = ib_dev;
- event.element.port_num = port;
- event.event = IB_EVENT_GID_CHANGE;
-
- ib_dispatch_event(&event);
- }
return ret;
}
@@ -201,41 +245,58 @@ static int del_gid(struct ib_device *ib_dev, u8 port,
GID_TABLE_WRITE_ACTION_DEL, default_gid);
}
+/* rwlock should be read locked */
static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
const struct ib_gid_attr *val, bool default_gid,
- unsigned long mask)
+ unsigned long mask, int *pempty)
{
- int i;
+ int i = 0;
+ int found = -1;
+ int empty = pempty ? -1 : 0;
- for (i = 0; i < table->sz; i++) {
- unsigned long flags;
- struct ib_gid_attr *attr = &table->data_vec[i].attr;
+ while (i < table->sz && (found < 0 || empty < 0)) {
+ struct ib_gid_table_entry *data = &table->data_vec[i];
+ struct ib_gid_attr *attr = &data->attr;
+ int curr_index = i;
- read_lock_irqsave(&table->data_vec[i].lock, flags);
+ i++;
- if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
- goto next;
+ if (data->props & GID_TABLE_ENTRY_INVALID)
+ continue;
+
+ if (empty < 0)
+ if (!memcmp(&data->gid, &zgid, sizeof(*gid)) &&
+ !memcmp(attr, &zattr, sizeof(*attr)) &&
+ !data->props)
+ empty = curr_index;
+
+ if (found >= 0)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+ attr->gid_type != val->gid_type)
+ continue;
if (mask & GID_ATTR_FIND_MASK_GID &&
- memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
- goto next;
+ memcmp(gid, &data->gid, sizeof(*gid)))
+ continue;
if (mask & GID_ATTR_FIND_MASK_NETDEV &&
attr->ndev != val->ndev)
- goto next;
+ continue;
if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
- !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+ !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
default_gid)
- goto next;
+ continue;
- read_unlock_irqrestore(&table->data_vec[i].lock, flags);
- return i;
-next:
- read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ found = curr_index;
}
- return -1;
+ if (pempty)
+ *pempty = empty;
+
+ return found;
}
static void make_default_gid(struct net_device *dev, union ib_gid *gid)
@@ -252,6 +313,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
int ix;
int ret = 0;
struct net_device *idev;
+ int empty;
table = ports_table[port - rdma_start_port(ib_dev)];
@@ -275,22 +337,25 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
}
mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
- GID_ATTR_FIND_MASK_NETDEV);
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV, &empty);
if (ix >= 0)
goto out_unlock;
- ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
- GID_ATTR_FIND_MASK_DEFAULT);
- if (ix < 0) {
+ if (empty < 0) {
ret = -ENOSPC;
goto out_unlock;
}
- add_gid(ib_dev, port, table, ix, gid, attr, false);
+ ret = add_gid(ib_dev, port, table, empty, gid, attr, false);
+ if (!ret)
+ dispatch_gid_change_event(ib_dev, port);
out_unlock:
+ write_unlock_irq(&table->rwlock);
mutex_unlock(&table->lock);
return ret;
}
@@ -305,17 +370,22 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
table = ports_table[port - rdma_start_port(ib_dev)];
mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
ix = find_gid(table, gid, attr, false,
GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
GID_ATTR_FIND_MASK_NETDEV |
- GID_ATTR_FIND_MASK_DEFAULT);
+ GID_ATTR_FIND_MASK_DEFAULT,
+ NULL);
if (ix < 0)
goto out_unlock;
- del_gid(ib_dev, port, table, ix, false);
+ if (!del_gid(ib_dev, port, table, ix, false))
+ dispatch_gid_change_event(ib_dev, port);
out_unlock:
+ write_unlock_irq(&table->rwlock);
mutex_unlock(&table->lock);
return 0;
}
@@ -326,16 +396,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
struct ib_gid_table *table;
int ix;
+ bool deleted = false;
table = ports_table[port - rdma_start_port(ib_dev)];
mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
for (ix = 0; ix < table->sz; ix++)
if (table->data_vec[ix].attr.ndev == ndev)
- del_gid(ib_dev, port, table, ix, false);
+ if (!del_gid(ib_dev, port, table, ix, false))
+ deleted = true;
+ write_unlock_irq(&table->rwlock);
mutex_unlock(&table->lock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+
return 0;
}
@@ -344,18 +422,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
{
struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
struct ib_gid_table *table;
- unsigned long flags;
table = ports_table[port - rdma_start_port(ib_dev)];
if (index < 0 || index >= table->sz)
return -EINVAL;
- read_lock_irqsave(&table->data_vec[index].lock, flags);
- if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
- read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
return -EAGAIN;
- }
memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
if (attr) {
@@ -364,7 +438,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
dev_hold(attr->ndev);
}
- read_unlock_irqrestore(&table->data_vec[index].lock, flags);
return 0;
}
@@ -378,17 +451,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
struct ib_gid_table *table;
u8 p;
int local_index;
+ unsigned long flags;
for (p = 0; p < ib_dev->phys_port_cnt; p++) {
table = ports_table[p];
- local_index = find_gid(table, gid, val, false, mask);
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, val, false, mask, NULL);
if (local_index >= 0) {
if (index)
*index = local_index;
if (port)
*port = p + rdma_start_port(ib_dev);
+ read_unlock_irqrestore(&table->rwlock, flags);
return 0;
}
+ read_unlock_irqrestore(&table->rwlock, flags);
}
return -ENOENT;
@@ -396,11 +473,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
static int ib_cache_gid_find(struct ib_device *ib_dev,
const union ib_gid *gid,
+ enum ib_gid_type gid_type,
struct net_device *ndev, u8 *port,
u16 *index)
{
- unsigned long mask = GID_ATTR_FIND_MASK_GID;
- struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
if (ndev)
mask |= GID_ATTR_FIND_MASK_NETDEV;
@@ -411,14 +490,17 @@ static int ib_cache_gid_find(struct ib_device *ib_dev,
int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
const union ib_gid *gid,
+ enum ib_gid_type gid_type,
u8 port, struct net_device *ndev,
u16 *index)
{
int local_index;
struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
struct ib_gid_table *table;
- unsigned long mask = GID_ATTR_FIND_MASK_GID;
- struct ib_gid_attr val = {.ndev = ndev};
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+ unsigned long flags;
if (port < rdma_start_port(ib_dev) ||
port > rdma_end_port(ib_dev))
@@ -429,13 +511,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
if (ndev)
mask |= GID_ATTR_FIND_MASK_NETDEV;
- local_index = find_gid(table, gid, &val, false, mask);
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, &val, false, mask, NULL);
if (local_index >= 0) {
if (index)
*index = local_index;
+ read_unlock_irqrestore(&table->rwlock, flags);
return 0;
}
+ read_unlock_irqrestore(&table->rwlock, flags);
return -ENOENT;
}
EXPORT_SYMBOL(ib_find_cached_gid_by_port);
@@ -472,6 +557,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
struct ib_gid_table *table;
unsigned int i;
+ unsigned long flags;
bool found = false;
if (!ports_table)
@@ -484,11 +570,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
table = ports_table[port - rdma_start_port(ib_dev)];
+ read_lock_irqsave(&table->rwlock, flags);
for (i = 0; i < table->sz; i++) {
struct ib_gid_attr attr;
- unsigned long flags;
- read_lock_irqsave(&table->data_vec[i].lock, flags);
if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
goto next;
@@ -501,11 +586,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
found = true;
next:
- read_unlock_irqrestore(&table->data_vec[i].lock, flags);
-
if (found)
break;
}
+ read_unlock_irqrestore(&table->rwlock, flags);
if (!found)
return -ENOENT;
@@ -517,9 +601,9 @@ next:
static struct ib_gid_table *alloc_gid_table(int sz)
{
- unsigned int i;
struct ib_gid_table *table =
kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+
if (!table)
return NULL;
@@ -530,9 +614,7 @@ static struct ib_gid_table *alloc_gid_table(int sz)
mutex_init(&table->lock);
table->sz = sz;
-
- for (i = 0; i < sz; i++)
- rwlock_init(&table->data_vec[i].lock);
+ rwlock_init(&table->rwlock);
return table;
@@ -553,30 +635,37 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table)
{
int i;
+ bool deleted = false;
if (!table)
return;
+ write_lock_irq(&table->rwlock);
for (i = 0; i < table->sz; ++i) {
if (memcmp(&table->data_vec[i].gid, &zgid,
sizeof(table->data_vec[i].gid)))
- del_gid(ib_dev, port, table, i,
- table->data_vec[i].props &
- GID_ATTR_FIND_MASK_DEFAULT);
+ if (!del_gid(ib_dev, port, table, i,
+ table->data_vec[i].props &
+ GID_ATTR_FIND_MASK_DEFAULT))
+ deleted = true;
}
+ write_unlock_irq(&table->rwlock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
}
void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
struct net_device *ndev,
+ unsigned long gid_type_mask,
enum ib_cache_gid_default_mode mode)
{
struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
union ib_gid gid;
struct ib_gid_attr gid_attr;
+ struct ib_gid_attr zattr_type = zattr;
struct ib_gid_table *table;
- int ix;
- union ib_gid current_gid;
- struct ib_gid_attr current_gid_attr = {};
+ unsigned int gid_type;
table = ports_table[port - rdma_start_port(ib_dev)];
@@ -584,46 +673,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
memset(&gid_attr, 0, sizeof(gid_attr));
gid_attr.ndev = ndev;
- mutex_lock(&table->lock);
- ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
-
- /* Coudn't find default GID location */
- WARN_ON(ix < 0);
-
- if (!__ib_cache_gid_get(ib_dev, port, ix,
- &current_gid, &current_gid_attr) &&
- mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
- !memcmp(&gid, &current_gid, sizeof(gid)) &&
- !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
- goto unlock;
-
- if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
- memcmp(&current_gid_attr, &zattr,
- sizeof(current_gid_attr))) &&
- del_gid(ib_dev, port, table, ix, true)) {
- pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
- ix, gid.raw);
- goto unlock;
- }
+ for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+ int ix;
+ union ib_gid current_gid;
+ struct ib_gid_attr current_gid_attr = {};
+
+ if (1UL << gid_type & ~gid_type_mask)
+ continue;
+
+ gid_attr.gid_type = gid_type;
+
+ mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
+ ix = find_gid(table, NULL, &gid_attr, true,
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT,
+ NULL);
+
+ /* Coudn't find default GID location */
+ WARN_ON(ix < 0);
+
+ zattr_type.gid_type = gid_type;
+
+ if (!__ib_cache_gid_get(ib_dev, port, ix,
+ &current_gid, &current_gid_attr) &&
+ mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+ !memcmp(&gid, &current_gid, sizeof(gid)) &&
+ !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+ goto release;
+
+ if (memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+ memcmp(&current_gid_attr, &zattr_type,
+ sizeof(current_gid_attr))) {
+ if (del_gid(ib_dev, port, table, ix, true)) {
+ pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+ ix, gid.raw);
+ goto release;
+ } else {
+ dispatch_gid_change_event(ib_dev, port);
+ }
+ }
- if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
- if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
- pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
- gid.raw);
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
+ if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+ pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+ gid.raw);
+ else
+ dispatch_gid_change_event(ib_dev, port);
+ }
-unlock:
- if (current_gid_attr.ndev)
- dev_put(current_gid_attr.ndev);
- mutex_unlock(&table->lock);
+release:
+ if (current_gid_attr.ndev)
+ dev_put(current_gid_attr.ndev);
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+ }
}
static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table)
{
- if (rdma_protocol_roce(ib_dev, port)) {
- struct ib_gid_table_entry *entry = &table->data_vec[0];
+ unsigned int i;
+ unsigned long roce_gid_type_mask;
+ unsigned int num_default_gids;
+ unsigned int current_gid = 0;
+
+ roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ num_default_gids = hweight_long(roce_gid_type_mask);
+ for (i = 0; i < num_default_gids && i < table->sz; i++) {
+ struct ib_gid_table_entry *entry =
+ &table->data_vec[i];
entry->props |= GID_TABLE_ENTRY_DEFAULT;
+ current_gid = find_next_bit(&roce_gid_type_mask,
+ BITS_PER_LONG,
+ current_gid);
+ entry->attr.gid_type = current_gid++;
}
return 0;
@@ -728,20 +853,30 @@ int ib_get_cached_gid(struct ib_device *device,
union ib_gid *gid,
struct ib_gid_attr *gid_attr)
{
+ int res;
+ unsigned long flags;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)];
+
if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
- return __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+ read_lock_irqsave(&table->rwlock, flags);
+ res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+ read_unlock_irqrestore(&table->rwlock, flags);
+
+ return res;
}
EXPORT_SYMBOL(ib_get_cached_gid);
int ib_find_cached_gid(struct ib_device *device,
const union ib_gid *gid,
+ enum ib_gid_type gid_type,
struct net_device *ndev,
u8 *port_num,
u16 *index)
{
- return ib_cache_gid_find(device, gid, ndev, port_num, index);
+ return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index);
}
EXPORT_SYMBOL(ib_find_cached_gid);
@@ -956,10 +1091,12 @@ static void ib_cache_update(struct ib_device *device,
device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
if (!use_roce_gid_table) {
+ write_lock(&table->rwlock);
for (i = 0; i < gid_cache->table_len; i++) {
modify_gid(device, port, table, i, gid_cache->table + i,
&zattr, false);
}
+ write_unlock(&table->rwlock);
}
device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 0a26dd6..1d92e09 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
read_lock_irqsave(&cm.device_lock, flags);
list_for_each_entry(cm_dev, &cm.device_list, list) {
if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
- ndev, &p, NULL)) {
+ path->gid_type, ndev, &p, NULL)) {
port = cm_dev->port[p-1];
break;
}
@@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
/* Check if the device started its remove_one */
- spin_lock_irq(&cm.lock);
+ spin_lock_irqsave(&cm.lock, flags);
if (!cm_dev->going_down)
queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
msecs_to_jiffies(wait_time));
- spin_unlock_irq(&cm.lock);
+ spin_unlock_irqrestore(&cm.lock, flags);
cm_id_priv->timewait_info = NULL;
}
@@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work)
struct ib_cm_id *cm_id;
struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
struct cm_req_msg *req_msg;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
int ret;
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
@@ -1639,11 +1641,31 @@ static int cm_req_handler(struct cm_work *work)
cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
- ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+ work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit;
+ ret = ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num,
+ cm_id_priv->av.ah_attr.grh.sgid_index,
+ &gid, &gid_attr);
+ if (!ret) {
+ if (gid_attr.ndev) {
+ work->path[0].ifindex = gid_attr.ndev->ifindex;
+ work->path[0].net = dev_net(gid_attr.ndev);
+ dev_put(gid_attr.ndev);
+ }
+ work->path[0].gid_type = gid_attr.gid_type;
+ ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+ }
if (ret) {
- ib_get_cached_gid(work->port->cm_dev->ib_device,
- work->port->port_num, 0, &work->path[0].sgid,
- NULL);
+ int err = ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0,
+ &work->path[0].sgid,
+ &gid_attr);
+ if (!err && gid_attr.ndev) {
+ work->path[0].ifindex = gid_attr.ndev->ifindex;
+ work->path[0].net = dev_net(gid_attr.ndev);
+ dev_put(gid_attr.ndev);
+ }
+ work->path[0].gid_type = gid_attr.gid_type;
ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
&work->path[0].sgid, sizeof work->path[0].sgid,
NULL, 0);
@@ -3482,6 +3504,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
EXPORT_SYMBOL(ib_cm_notify);
static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct cm_port *port = mad_agent->context;
@@ -3731,16 +3754,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
}
EXPORT_SYMBOL(ib_cm_init_qp_attr);
-static void cm_get_ack_delay(struct cm_device *cm_dev)
-{
- struct ib_device_attr attr;
-
- if (ib_query_device(cm_dev->ib_device, &attr))
- cm_dev->ack_delay = 0; /* acks will rely on packet life time */
- else
- cm_dev->ack_delay = attr.local_ca_ack_delay;
-}
-
static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
char *buf)
{
@@ -3852,7 +3865,7 @@ static void cm_add_one(struct ib_device *ib_device)
return;
cm_dev->ib_device = ib_device;
- cm_get_ack_delay(cm_dev);
+ cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
cm_dev->device = device_create(&cm_class, &ib_device->dev,
MKDEV(0, 0), NULL,
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 2d762a2..9729639 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -38,6 +38,7 @@
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/random.h>
+#include <linux/igmp.h>
#include <linux/idr.h>
#include <linux/inetdevice.h>
#include <linux/slab.h>
@@ -60,6 +61,8 @@
#include <rdma/ib_sa.h>
#include <rdma/iw_cm.h>
+#include "core_priv.h"
+
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("Generic RDMA CM Agent");
MODULE_LICENSE("Dual BSD/GPL");
@@ -150,6 +153,7 @@ struct cma_device {
struct completion comp;
atomic_t refcount;
struct list_head id_list;
+ enum ib_gid_type *default_gid_type;
};
struct rdma_bind_list {
@@ -185,6 +189,67 @@ enum {
CMA_OPTION_AFONLY,
};
+void cma_ref_dev(struct cma_device *cma_dev)
+{
+ atomic_inc(&cma_dev->refcount);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie)
+{
+ struct cma_device *cma_dev;
+ struct cma_device *found_cma_dev = NULL;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list)
+ if (filter(cma_dev->device, cookie)) {
+ found_cma_dev = cma_dev;
+ break;
+ }
+
+ if (found_cma_dev)
+ cma_ref_dev(found_cma_dev);
+ mutex_unlock(&lock);
+ return found_cma_dev;
+}
+
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port)
+{
+ if (port < rdma_start_port(cma_dev->device) ||
+ port > rdma_end_port(cma_dev->device))
+ return -EINVAL;
+
+ return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type)
+{
+ unsigned long supported_gids;
+
+ if (port < rdma_start_port(cma_dev->device) ||
+ port > rdma_end_port(cma_dev->device))
+ return -EINVAL;
+
+ supported_gids = roce_gid_type_mask_support(cma_dev->device, port);
+
+ if (!(supported_gids & 1 << default_gid_type))
+ return -EINVAL;
+
+ cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
+ default_gid_type;
+
+ return 0;
+}
+
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
+{
+ return cma_dev->device;
+}
+
/*
* Device removal can occur at anytime, so we need extra handling to
* serialize notifying the user of device removal with other callbacks.
@@ -228,6 +293,7 @@ struct rdma_id_private {
u8 tos;
u8 reuseaddr;
u8 afonly;
+ enum ib_gid_type gid_type;
};
struct cma_multicast {
@@ -239,6 +305,7 @@ struct cma_multicast {
void *context;
struct sockaddr_storage addr;
struct kref mcref;
+ bool igmp_joined;
};
struct cma_work {
@@ -335,18 +402,48 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
}
-static void cma_attach_to_dev(struct rdma_id_private *id_priv,
- struct cma_device *cma_dev)
+static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
{
- atomic_inc(&cma_dev->refcount);
+ struct in_device *in_dev = NULL;
+
+ if (ndev) {
+ rtnl_lock();
+ in_dev = __in_dev_get_rtnl(ndev);
+ if (in_dev) {
+ if (join)
+ ip_mc_inc_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ else
+ ip_mc_dec_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ }
+ rtnl_unlock();
+ }
+ return (in_dev) ? 0 : -ENODEV;
+}
+
+static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ cma_ref_dev(cma_dev);
id_priv->cma_dev = cma_dev;
+ id_priv->gid_type = 0;
id_priv->id.device = cma_dev->device;
id_priv->id.route.addr.dev_addr.transport =
rdma_node_get_transport(cma_dev->device->node_type);
list_add_tail(&id_priv->list, &cma_dev->id_list);
}
-static inline void cma_deref_dev(struct cma_device *cma_dev)
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ _cma_attach_to_dev(id_priv, cma_dev);
+ id_priv->gid_type =
+ cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(cma_dev->device)];
+}
+
+void cma_deref_dev(struct cma_device *cma_dev)
{
if (atomic_dec_and_test(&cma_dev->refcount))
complete(&cma_dev->comp);
@@ -441,6 +538,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
}
static inline int cma_validate_port(struct ib_device *device, u8 port,
+ enum ib_gid_type gid_type,
union ib_gid *gid, int dev_type,
int bound_if_index)
{
@@ -453,10 +551,25 @@ static inline int cma_validate_port(struct ib_device *device, u8 port,
if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
return ret;
- if (dev_type == ARPHRD_ETHER)
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
ndev = dev_get_by_index(&init_net, bound_if_index);
+ if (ndev && ndev->flags & IFF_LOOPBACK) {
+ pr_info("detected loopback device\n");
+ dev_put(ndev);
- ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL);
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ ndev = device->get_netdev(device, port);
+ if (!ndev)
+ return -ENODEV;
+ }
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
+
+ ret = ib_find_cached_gid_by_port(device, gid, gid_type, port,
+ ndev, NULL);
if (ndev)
dev_put(ndev);
@@ -490,7 +603,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
gidp = rdma_protocol_roce(cma_dev->device, port) ?
&iboe_gid : &gid;
- ret = cma_validate_port(cma_dev->device, port, gidp,
+ ret = cma_validate_port(cma_dev->device, port,
+ rdma_protocol_ib(cma_dev->device, port) ?
+ IB_GID_TYPE_IB :
+ listen_id_priv->gid_type, gidp,
dev_addr->dev_type,
dev_addr->bound_dev_if);
if (!ret) {
@@ -509,8 +625,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
gidp = rdma_protocol_roce(cma_dev->device, port) ?
&iboe_gid : &gid;
- ret = cma_validate_port(cma_dev->device, port, gidp,
- dev_addr->dev_type,
+ ret = cma_validate_port(cma_dev->device, port,
+ rdma_protocol_ib(cma_dev->device, port) ?
+ IB_GID_TYPE_IB :
+ cma_dev->default_gid_type[port - 1],
+ gidp, dev_addr->dev_type,
dev_addr->bound_dev_if);
if (!ret) {
id_priv->id.port_num = port;
@@ -1437,8 +1556,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
id_priv->id.port_num)) {
ib_sa_free_multicast(mc->multicast.ib);
kfree(mc);
- } else
+ } else {
+ if (mc->igmp_joined) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(&init_net,
+ dev_addr->bound_dev_if);
+ if (ndev) {
+ cma_igmp_send(ndev,
+ &mc->multicast.ib->rec.mgid,
+ false);
+ dev_put(ndev);
+ }
+ }
kref_put(&mc->mcref, release_mc);
+ }
}
}
@@ -1896,7 +2031,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
int ret;
- struct ib_device_attr attr;
struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
@@ -1938,13 +2072,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
- ret = ib_query_device(conn_id->id.device, &attr);
- if (ret) {
- mutex_unlock(&conn_id->handler_mutex);
- rdma_destroy_id(new_cm_id);
- goto out;
- }
-
memset(&event, 0, sizeof event);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
event.param.conn.private_data = iw_event->private_data;
@@ -2051,7 +2178,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
rdma_addr_size(cma_src_addr(id_priv)));
- cma_attach_to_dev(dev_id_priv, cma_dev);
+ _cma_attach_to_dev(dev_id_priv, cma_dev);
list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
atomic_inc(&id_priv->refcount);
dev_id_priv->internal_id = 1;
@@ -2321,8 +2448,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
if (addr->dev_addr.bound_dev_if) {
ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+ if (!ndev)
+ return -ENODEV;
+
+ if (ndev->flags & IFF_LOOPBACK) {
+ dev_put(ndev);
+ if (!id_priv->id.device->get_netdev)
+ return -EOPNOTSUPP;
+
+ ndev = id_priv->id.device->get_netdev(id_priv->id.device,
+ id_priv->id.port_num);
+ if (!ndev)
+ return -ENODEV;
+ }
+
route->path_rec->net = &init_net;
- route->path_rec->ifindex = addr->dev_addr.bound_dev_if;
+ route->path_rec->ifindex = ndev->ifindex;
+ route->path_rec->gid_type = id_priv->gid_type;
}
if (!ndev) {
ret = -ENODEV;
@@ -2336,7 +2478,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
&route->path_rec->dgid);
- route->path_rec->hop_limit = 1;
+ /* Use the hint from IP Stack to select GID Type */
+ if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+ route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+ if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
+ /* TODO: get the hoplimit from the inet/inet6 device */
+ route->path_rec->hop_limit = addr->dev_addr.hoplimit;
+ else
+ route->path_rec->hop_limit = 1;
route->path_rec->reversible = 1;
route->path_rec->pkey = cpu_to_be16(0xffff);
route->path_rec->mtu_selector = IB_SA_EQ;
@@ -3534,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
event.status = status;
event.param.ud.private_data = mc->context;
if (!status) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev =
+ dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ enum ib_gid_type gid_type =
+ id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+
event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
ib_init_ah_from_mcmember(id_priv->id.device,
id_priv->id.port_num, &multicast->rec,
+ ndev, gid_type,
&event.param.ud.ah_attr);
event.param.ud.qp_num = 0xFFFFFF;
event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ if (ndev)
+ dev_put(ndev);
} else
event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
@@ -3672,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
{
struct iboe_mcast_work *work;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
- int err;
+ int err = 0;
struct sockaddr *addr = (struct sockaddr *)&mc->addr;
struct net_device *ndev = NULL;
+ enum ib_gid_type gid_type;
if (cma_zero_addr((struct sockaddr *)&mc->addr))
return -EINVAL;
@@ -3704,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
mc->multicast.ib->rec.hop_limit = 1;
mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+
+ gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ if (addr->sa_family == AF_INET) {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+ err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+ true);
+ if (!err) {
+ mc->igmp_joined = true;
+ mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+ }
+ } else {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+ err = -ENOTSUPP;
+ }
dev_put(ndev);
- if (!mc->multicast.ib->rec.mtu) {
- err = -EINVAL;
+ if (err || !mc->multicast.ib->rec.mtu) {
+ if (!err)
+ err = -EINVAL;
goto out2;
}
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
@@ -3745,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
memcpy(&mc->addr, addr, rdma_addr_size(addr));
mc->context = context;
mc->id_priv = id_priv;
-
+ mc->igmp_joined = false;
spin_lock(&id_priv->lock);
list_add(&mc->list, &id_priv->mc_list);
spin_unlock(&id_priv->lock);
@@ -3790,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
if (rdma_cap_ib_mcast(id->device, id->port_num)) {
ib_sa_free_multicast(mc->multicast.ib);
kfree(mc);
- } else if (rdma_protocol_roce(id->device, id->port_num))
+ } else if (rdma_protocol_roce(id->device, id->port_num)) {
+ if (mc->igmp_joined) {
+ struct rdma_dev_addr *dev_addr =
+ &id->route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(&init_net,
+ dev_addr->bound_dev_if);
+ if (ndev) {
+ cma_igmp_send(ndev,
+ &mc->multicast.ib->rec.mgid,
+ false);
+ dev_put(ndev);
+ }
+ mc->igmp_joined = false;
+ }
kref_put(&mc->mcref, release_mc);
-
+ }
return;
}
}
@@ -3861,12 +4054,27 @@ static void cma_add_one(struct ib_device *device)
{
struct cma_device *cma_dev;
struct rdma_id_private *id_priv;
+ unsigned int i;
+ unsigned long supported_gids = 0;
cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
if (!cma_dev)
return;
cma_dev->device = device;
+ cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_gid_type),
+ GFP_KERNEL);
+ if (!cma_dev->default_gid_type) {
+ kfree(cma_dev);
+ return;
+ }
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ supported_gids = roce_gid_type_mask_support(device, i);
+ WARN_ON(!supported_gids);
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ find_first_bit(&supported_gids, BITS_PER_LONG);
+ }
init_completion(&cma_dev->comp);
atomic_set(&cma_dev->refcount, 1);
@@ -3946,6 +4154,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
mutex_unlock(&lock);
cma_process_remove(cma_dev);
+ kfree(cma_dev->default_gid_type);
kfree(cma_dev);
}
@@ -4079,6 +4288,7 @@ static int __init cma_init(void)
if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+ cma_configfs_init();
return 0;
@@ -4093,6 +4303,7 @@ err_wq:
static void __exit cma_cleanup(void)
{
+ cma_configfs_exit();
ibnl_remove_client(RDMA_NL_RDMA_CM);
ib_unregister_client(&cma_client);
unregister_netdevice_notifier(&cma_nb);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
new file mode 100644
index 0000000..18b112a
--- /dev/null
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <rdma/ib_verbs.h>
+#include "core_priv.h"
+
+struct cma_device;
+
+struct cma_dev_group;
+
+struct cma_dev_port_group {
+ unsigned int port_num;
+ struct cma_dev_group *cma_dev_group;
+ struct config_group group;
+};
+
+struct cma_dev_group {
+ char name[IB_DEVICE_NAME_MAX];
+ struct config_group device_group;
+ struct config_group ports_group;
+ struct config_group *default_dev_group[2];
+ struct config_group **default_ports_group;
+ struct cma_dev_port_group *ports;
+};
+
+static struct cma_dev_port_group *to_dev_port_group(struct config_item *item)
+{
+ struct config_group *group;
+
+ if (!item)
+ return NULL;
+
+ group = container_of(item, struct config_group, cg_item);
+ return container_of(group, struct cma_dev_port_group, group);
+}
+
+static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
+{
+ return !strcmp(ib_dev->name, cookie);
+}
+
+static int cma_configfs_params_get(struct config_item *item,
+ struct cma_device **pcma_dev,
+ struct cma_dev_port_group **pgroup)
+{
+ struct cma_dev_port_group *group = to_dev_port_group(item);
+ struct cma_device *cma_dev;
+
+ if (!group)
+ return -ENODEV;
+
+ cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ group->cma_dev_group->name);
+ if (!cma_dev)
+ return -ENODEV;
+
+ *pcma_dev = cma_dev;
+ *pgroup = group;
+
+ return 0;
+}
+
+static void cma_configfs_params_put(struct cma_device *cma_dev)
+{
+ cma_deref_dev(cma_dev);
+}
+
+static ssize_t default_roce_mode_show(struct config_item *item,
+ char *buf)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type;
+ ssize_t ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ gid_type = cma_get_default_gid_type(cma_dev, group->port_num);
+ cma_configfs_params_put(cma_dev);
+
+ if (gid_type < 0)
+ return gid_type;
+
+ return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type));
+}
+
+static ssize_t default_roce_mode_store(struct config_item *item,
+ const char *buf, size_t count)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type = ib_cache_gid_parse_type_str(buf);
+ ssize_t ret;
+
+ if (gid_type < 0)
+ return -EINVAL;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type);
+
+ cma_configfs_params_put(cma_dev);
+
+ return !ret ? strnlen(buf, count) : ret;
+}
+
+CONFIGFS_ATTR(, default_roce_mode);
+
+static struct configfs_attribute *cma_configfs_attributes[] = {
+ &attr_default_roce_mode,
+ NULL,
+};
+
+static struct config_item_type cma_port_group_type = {
+ .ct_attrs = cma_configfs_attributes,
+ .ct_owner = THIS_MODULE
+};
+
+static int make_cma_ports(struct cma_dev_group *cma_dev_group,
+ struct cma_device *cma_dev)
+{
+ struct ib_device *ibdev;
+ unsigned int i;
+ unsigned int ports_num;
+ struct cma_dev_port_group *ports;
+ struct config_group **ports_group;
+ int err;
+
+ ibdev = cma_get_ib_dev(cma_dev);
+
+ if (!ibdev)
+ return -ENODEV;
+
+ ports_num = ibdev->phys_port_cnt;
+ ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports),
+ GFP_KERNEL);
+ ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL);
+
+ if (!ports || !ports_group) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ for (i = 0; i < ports_num; i++) {
+ char port_str[10];
+
+ ports[i].port_num = i + 1;
+ snprintf(port_str, sizeof(port_str), "%u", i + 1);
+ ports[i].cma_dev_group = cma_dev_group;
+ config_group_init_type_name(&ports[i].group,
+ port_str,
+ &cma_port_group_type);
+ ports_group[i] = &ports[i].group;
+ }
+ ports_group[i] = NULL;
+ cma_dev_group->default_ports_group = ports_group;
+ cma_dev_group->ports = ports;
+
+ return 0;
+free:
+ kfree(ports);
+ kfree(ports_group);
+ cma_dev_group->ports = NULL;
+ cma_dev_group->default_ports_group = NULL;
+ return err;
+}
+
+static void release_cma_dev(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ device_group);
+
+ kfree(cma_dev_group);
+};
+
+static void release_cma_ports_group(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ ports_group);
+
+ kfree(cma_dev_group->ports);
+ kfree(cma_dev_group->default_ports_group);
+ cma_dev_group->ports = NULL;
+ cma_dev_group->default_ports_group = NULL;
+};
+
+static struct configfs_item_operations cma_ports_item_ops = {
+ .release = release_cma_ports_group
+};
+
+static struct config_item_type cma_ports_group_type = {
+ .ct_item_ops = &cma_ports_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct configfs_item_operations cma_device_item_ops = {
+ .release = release_cma_dev
+};
+
+static struct config_item_type cma_device_group_type = {
+ .ct_item_ops = &cma_device_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct config_group *make_cma_dev(struct config_group *group,
+ const char *name)
+{
+ int err = -ENODEV;
+ struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ (void *)name);
+ struct cma_dev_group *cma_dev_group = NULL;
+
+ if (!cma_dev)
+ goto fail;
+
+ cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL);
+
+ if (!cma_dev_group) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+
+ err = make_cma_ports(cma_dev_group, cma_dev);
+ if (err)
+ goto fail;
+
+ cma_dev_group->ports_group.default_groups =
+ cma_dev_group->default_ports_group;
+ config_group_init_type_name(&cma_dev_group->ports_group, "ports",
+ &cma_ports_group_type);
+
+ cma_dev_group->device_group.default_groups
+ = cma_dev_group->default_dev_group;
+ cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group;
+ cma_dev_group->default_dev_group[1] = NULL;
+
+ config_group_init_type_name(&cma_dev_group->device_group, name,
+ &cma_device_group_type);
+
+ cma_deref_dev(cma_dev);
+ return &cma_dev_group->device_group;
+
+fail:
+ if (cma_dev)
+ cma_deref_dev(cma_dev);
+ kfree(cma_dev_group);
+ return ERR_PTR(err);
+}
+
+static struct configfs_group_operations cma_subsys_group_ops = {
+ .make_group = make_cma_dev,
+};
+
+static struct config_item_type cma_subsys_type = {
+ .ct_group_ops = &cma_subsys_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem cma_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "rdma_cm",
+ .ci_type = &cma_subsys_type,
+ },
+ },
+};
+
+int __init cma_configfs_init(void)
+{
+ config_group_init(&cma_subsys.su_group);
+ mutex_init(&cma_subsys.su_mutex);
+ return configfs_register_subsystem(&cma_subsys);
+}
+
+void __exit cma_configfs_exit(void)
+{
+ configfs_unregister_subsystem(&cma_subsys);
+}
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 5cf6eb7..eab3221 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -38,6 +38,32 @@
#include <rdma/ib_verbs.h>
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+ return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+struct cma_device;
+void cma_ref_dev(struct cma_device *cma_dev);
+void cma_deref_dev(struct cma_device *cma_dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie);
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port);
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type);
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
+
int ib_device_register_sysfs(struct ib_device *device,
int (*port_callback)(struct ib_device *,
u8, struct kobject *));
@@ -70,8 +96,13 @@ enum ib_cache_gid_default_mode {
IB_CACHE_GID_DEFAULT_MODE_DELETE
};
+int ib_cache_gid_parse_type_str(const char *buf);
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
+
void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
struct net_device *ndev,
+ unsigned long gid_type_mask,
enum ib_cache_gid_default_mode mode);
int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
@@ -87,9 +118,23 @@ int roce_gid_mgmt_init(void);
void roce_gid_mgmt_cleanup(void);
int roce_rescan_device(struct ib_device *ib_dev);
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
int ib_cache_setup_one(struct ib_device *device);
void ib_cache_cleanup_one(struct ib_device *device);
void ib_cache_release_one(struct ib_device *device);
+static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ struct net_device *_upper = NULL;
+ struct list_head *iter;
+
+ netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+ if (_upper == upper)
+ break;
+
+ return _upper == upper;
+}
+
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 0000000..a754fc7
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH 16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ 256
+#define IB_POLL_BUDGET_WORKQUEUE 65536
+
+#define IB_POLL_FLAGS \
+ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+ int i, n, completed = 0;
+
+ while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = &cq->wc[i];
+
+ if (wc->wr_cqe)
+ wc->wr_cqe->done(cq, wc);
+ else
+ WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+ }
+
+ completed += n;
+
+ if (n != IB_POLL_BATCH ||
+ (budget != -1 && completed >= budget))
+ break;
+ }
+
+ return completed;
+}
+
+/**
+ * ib_process_direct_cq - process a CQ in caller context
+ * @cq: CQ to process
+ * @budget: number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different
+ * context and does not ask for completion interrupts from the HCA.
+ *
+ * Note: for compatibility reasons -1 can be passed in %budget for unlimited
+ * polling. Do not use this feature in new code, it will be removed soon.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+ WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+
+ return __ib_process_cq(cq, budget);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+ WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+ struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+ int completed;
+
+ completed = __ib_process_cq(cq, budget);
+ if (completed < budget) {
+ irq_poll_complete(&cq->iop);
+ if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ irq_poll_sched(&cq->iop);
+ }
+
+ return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+ irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+ struct ib_cq *cq = container_of(work, struct ib_cq, work);
+ int completed;
+
+ completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+ if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+ ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ queue_work(ib_comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+ queue_work(ib_comp_wq, &cq->work);
+}
+
+/**
+ * ib_alloc_cq - allocate a completion queue
+ * @dev: device to allocate the CQ for
+ * @private: driver private data, accessible from cq->cq_context
+ * @nr_cqe: number of CQEs to allocate
+ * @comp_vector: HCA completion vectors for this CQ
+ * @poll_ctx: context to poll the CQ from.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context. The ULP needs must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+{
+ struct ib_cq_init_attr cq_attr = {
+ .cqe = nr_cqe,
+ .comp_vector = comp_vector,
+ };
+ struct ib_cq *cq;
+ int ret = -ENOMEM;
+
+ cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+ if (IS_ERR(cq))
+ return cq;
+
+ cq->device = dev;
+ cq->uobject = NULL;
+ cq->event_handler = NULL;
+ cq->cq_context = private;
+ cq->poll_ctx = poll_ctx;
+ atomic_set(&cq->usecnt, 0);
+
+ cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+ if (!cq->wc)
+ goto out_destroy_cq;
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ cq->comp_handler = ib_cq_completion_direct;
+ break;
+ case IB_POLL_SOFTIRQ:
+ cq->comp_handler = ib_cq_completion_softirq;
+
+ irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ break;
+ case IB_POLL_WORKQUEUE:
+ cq->comp_handler = ib_cq_completion_workqueue;
+ INIT_WORK(&cq->work, ib_cq_poll_work);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_free_wc;
+ }
+
+ return cq;
+
+out_free_wc:
+ kfree(cq->wc);
+out_destroy_cq:
+ cq->device->destroy_cq(cq);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_cq);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq: completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+ return;
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ break;
+ case IB_POLL_SOFTIRQ:
+ irq_poll_disable(&cq->iop);
+ break;
+ case IB_POLL_WORKQUEUE:
+ flush_work(&cq->work);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ kfree(cq->wc);
+ ret = cq->device->destroy_cq(cq);
+ WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL(ib_free_cq);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 179e813..00da80e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -58,6 +58,7 @@ struct ib_client_data {
bool going_down;
};
+struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
@@ -325,6 +326,7 @@ int ib_register_device(struct ib_device *device,
{
int ret;
struct ib_client *client;
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
mutex_lock(&device_mutex);
@@ -352,6 +354,13 @@ int ib_register_device(struct ib_device *device,
goto out;
}
+ memset(&device->attrs, 0, sizeof(device->attrs));
+ ret = device->query_device(device, &device->attrs, &uhw);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't query the device attributes\n");
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device, port_callback);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
@@ -628,25 +637,6 @@ void ib_dispatch_event(struct ib_event *event)
EXPORT_SYMBOL(ib_dispatch_event);
/**
- * ib_query_device - Query IB device attributes
- * @device:Device to query
- * @device_attr:Device attributes
- *
- * ib_query_device() returns the attributes of a device through the
- * @device_attr pointer.
- */
-int ib_query_device(struct ib_device *device,
- struct ib_device_attr *device_attr)
-{
- struct ib_udata uhw = {.outlen = 0, .inlen = 0};
-
- memset(device_attr, 0, sizeof(*device_attr));
-
- return device->query_device(device, device_attr, &uhw);
-}
-EXPORT_SYMBOL(ib_query_device);
-
-/**
* ib_query_port - Query IB port attributes
* @device:Device to query
* @port_num:Port number to query
@@ -825,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port);
* a specified GID value occurs.
* @device: The device to query.
* @gid: The GID value to search for.
+ * @gid_type: Type of GID.
* @ndev: The ndev related to the GID to search for.
* @port_num: The port number of the device where the GID value was found.
* @index: The index into the GID table where the GID was found. This
* parameter may be NULL.
*/
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
- struct net_device *ndev, u8 *port_num, u16 *index)
+ enum ib_gid_type gid_type, struct net_device *ndev,
+ u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
int ret, port, i;
for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
if (rdma_cap_roce_gid_table(device, port)) {
- if (!ib_find_cached_gid_by_port(device, gid, port,
+ if (!ib_find_cached_gid_by_port(device, gid, gid_type, port,
ndev, index)) {
*port_num = port;
return 0;
}
}
+ if (gid_type != IB_GID_TYPE_IB)
+ continue;
+
for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
if (ret)
@@ -954,10 +949,18 @@ static int __init ib_core_init(void)
if (!ib_wq)
return -ENOMEM;
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ WQ_UNBOUND_MAX_ACTIVE);
+ if (!ib_comp_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
ret = class_register(&ib_class);
if (ret) {
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
- goto err;
+ goto err_comp;
}
ret = ibnl_init();
@@ -972,7 +975,8 @@ static int __init ib_core_init(void)
err_sysfs:
class_unregister(&ib_class);
-
+err_comp:
+ destroy_workqueue(ib_comp_wq);
err:
destroy_workqueue(ib_wq);
return ret;
@@ -983,6 +987,7 @@ static void __exit ib_core_cleanup(void)
ib_cache_cleanup();
ibnl_cleanup();
class_unregister(&ib_class);
+ destroy_workqueue(ib_comp_wq);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
}
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 9f5ad7c..6ac3683 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
{
struct ib_device *device;
struct ib_fmr_pool *pool;
- struct ib_device_attr *attr;
int i;
int ret;
int max_remaps;
@@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
return ERR_PTR(-ENOSYS);
}
- attr = kmalloc(sizeof *attr, GFP_KERNEL);
- if (!attr) {
- printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
- return ERR_PTR(-ENOMEM);
- }
-
- ret = ib_query_device(device, attr);
- if (ret) {
- printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
- kfree(attr);
- return ERR_PTR(ret);
- }
-
- if (!attr->max_map_per_fmr)
+ if (!device->attrs.max_map_per_fmr)
max_remaps = IB_FMR_MAX_REMAPS;
else
- max_remaps = attr->max_map_per_fmr;
-
- kfree(attr);
+ max_remaps = device->attrs.max_map_per_fmr;
pool = kmalloc(sizeof *pool, GFP_KERNEL);
if (!pool) {
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 2281de1..9fa5bf3 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -84,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
u8 mgmt_class);
static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
struct ib_mad_agent_private *agent_priv);
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc);
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
/*
* Returns a ib_mad_port_private structure or NULL for a device/port
@@ -681,7 +684,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
atomic_inc(&mad_snoop_priv->refcount);
spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
- mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL,
mad_recv_wc);
deref_snoop_agent(mad_snoop_priv);
spin_lock_irqsave(&qp_info->snoop_lock, flags);
@@ -689,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
}
-static void build_smp_wc(struct ib_qp *qp,
- u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
- struct ib_wc *wc)
+static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
+ u16 pkey_index, u8 port_num, struct ib_wc *wc)
{
memset(wc, 0, sizeof *wc);
- wc->wr_id = wr_id;
+ wc->wr_cqe = cqe;
wc->status = IB_WC_SUCCESS;
wc->opcode = IB_WC_RECV;
wc->pkey_index = pkey_index;
@@ -832,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
}
build_smp_wc(mad_agent_priv->agent.qp,
- send_wr->wr.wr_id, drslid,
+ send_wr->wr.wr_cqe, drslid,
send_wr->pkey_index,
send_wr->port_num, &mad_wc);
@@ -1039,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
- mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr;
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
mad_send_wr->send_wr.wr.num_sge = 2;
mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
@@ -1151,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
/* Set WR ID to find mad_send_wr upon completion */
qp_info = mad_send_wr->mad_agent_priv->qp_info;
- mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
mad_agent = mad_send_wr->send_buf.mad_agent;
sge = mad_send_wr->sg_list;
@@ -1982,9 +1987,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
/* user rmpp is in effect
* and this is an active RMPP MAD
*/
- mad_recv_wc->wc->wr_id = 0;
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
- mad_recv_wc);
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent, NULL,
+ mad_recv_wc);
atomic_dec(&mad_agent_priv->refcount);
} else {
/* not user rmpp, revert to normal behavior and
@@ -1998,9 +2003,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
/* Defined behavior is to complete response before request */
- mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
- mad_recv_wc);
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &mad_send_wr->send_buf,
+ mad_recv_wc);
atomic_dec(&mad_agent_priv->refcount);
mad_send_wc.status = IB_WC_SUCCESS;
@@ -2009,7 +2015,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
}
} else {
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
mad_recv_wc);
deref_mad_agent(mad_agent_priv);
}
@@ -2172,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv,
return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
}
-static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
struct ib_mad_qp_info *qp_info;
struct ib_mad_private_header *mad_priv_hdr;
struct ib_mad_private *recv, *response = NULL;
- struct ib_mad_list_head *mad_list;
struct ib_mad_agent_private *mad_agent;
int port_num;
int ret = IB_MAD_RESULT_SUCCESS;
@@ -2186,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
u16 resp_mad_pkey_index = 0;
bool opa;
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+ }
+
qp_info = mad_list->mad_queue->qp_info;
dequeue_mad(mad_list);
@@ -2227,7 +2244,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
response = alloc_mad_private(mad_size, GFP_KERNEL);
if (!response) {
dev_err(&port_priv->device->dev,
- "ib_mad_recv_done_handler no memory for response buffer\n");
+ "%s: no memory for response buffer\n", __func__);
goto out;
}
@@ -2413,11 +2430,12 @@ done:
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
-static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
- struct ib_mad_list_head *mad_list;
struct ib_mad_qp_info *qp_info;
struct ib_mad_queue *send_queue;
struct ib_send_wr *bad_send_wr;
@@ -2425,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
unsigned long flags;
int ret;
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (!ib_mad_send_error(port_priv, wc))
+ return;
+ }
+
mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
mad_list);
send_queue = mad_list->mad_queue;
@@ -2490,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
}
-static void mad_error_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
{
- struct ib_mad_list_head *mad_list;
- struct ib_mad_qp_info *qp_info;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info;
struct ib_mad_send_wr_private *mad_send_wr;
int ret;
- /* Determine if failure was a send or receive */
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
- qp_info = mad_list->mad_queue->qp_info;
- if (mad_list->mad_queue == &qp_info->recv_queue)
- /*
- * Receive errors indicate that the QP has entered the error
- * state - error handling/shutdown code will cleanup
- */
- return;
-
/*
* Send errors will transition the QP to SQE - move
* QP to RTS and repost flushed work requests
@@ -2522,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
mad_send_wr->retry = 0;
ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
&bad_send_wr);
- if (ret)
- ib_mad_send_done_handler(port_priv, wc);
- } else
- ib_mad_send_done_handler(port_priv, wc);
+ if (!ret)
+ return false;
+ }
} else {
struct ib_qp_attr *attr;
@@ -2539,42 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
kfree(attr);
if (ret)
dev_err(&port_priv->device->dev,
- "mad_error_handler - ib_modify_qp to RTS : %d\n",
- ret);
+ "%s - ib_modify_qp to RTS: %d\n",
+ __func__, ret);
else
mark_sends_for_retry(qp_info);
}
- ib_mad_send_done_handler(port_priv, wc);
}
-}
-/*
- * IB MAD completion callback
- */
-static void ib_mad_completion_handler(struct work_struct *work)
-{
- struct ib_mad_port_private *port_priv;
- struct ib_wc wc;
-
- port_priv = container_of(work, struct ib_mad_port_private, work);
- ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
-
- while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
- if (wc.status == IB_WC_SUCCESS) {
- switch (wc.opcode) {
- case IB_WC_SEND:
- ib_mad_send_done_handler(port_priv, &wc);
- break;
- case IB_WC_RECV:
- ib_mad_recv_done_handler(port_priv, &wc);
- break;
- default:
- BUG_ON(1);
- break;
- }
- } else
- mad_error_handler(port_priv, &wc);
- }
+ return true;
}
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
@@ -2716,7 +2703,7 @@ static void local_completions(struct work_struct *work)
* before request
*/
build_smp_wc(recv_mad_agent->agent.qp,
- (unsigned long) local->mad_send_wr,
+ local->mad_send_wr->send_wr.wr.wr_cqe,
be16_to_cpu(IB_LID_PERMISSIVE),
local->mad_send_wr->send_wr.pkey_index,
recv_mad_agent->agent.port_num, &wc);
@@ -2744,6 +2731,7 @@ static void local_completions(struct work_struct *work)
IB_MAD_SNOOP_RECVS);
recv_mad_agent->agent.recv_handler(
&recv_mad_agent->agent,
+ &local->mad_send_wr->send_buf,
&local->mad_priv->header.recv_wc);
spin_lock_irqsave(&recv_mad_agent->lock, flags);
atomic_dec(&recv_mad_agent->refcount);
@@ -2855,17 +2843,6 @@ static void timeout_sends(struct work_struct *work)
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
-static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
-{
- struct ib_mad_port_private *port_priv = cq->cq_context;
- unsigned long flags;
-
- spin_lock_irqsave(&ib_mad_port_list_lock, flags);
- if (!list_empty(&port_priv->port_list))
- queue_work(port_priv->wq, &port_priv->work);
- spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
-}
-
/*
* Allocate receive MADs and post receive WRs for them
*/
@@ -2913,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
break;
}
mad_priv->header.mapping = sg_list.addr;
- recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
mad_priv->header.mad_list.mad_queue = recv_queue;
+ mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
+ recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
/* Post receive WR */
spin_lock_irqsave(&recv_queue->lock, flags);
@@ -3151,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device,
unsigned long flags;
char name[sizeof "ib_mad123"];
int has_smi;
- struct ib_cq_init_attr cq_attr = {};
if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
return -EFAULT;
@@ -3179,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device,
if (has_smi)
cq_size *= 2;
- cq_attr.cqe = cq_size;
- port_priv->cq = ib_create_cq(port_priv->device,
- ib_mad_thread_completion_handler,
- NULL, port_priv, &cq_attr);
+ port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
@@ -3211,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device,
ret = -ENOMEM;
goto error8;
}
- INIT_WORK(&port_priv->work, ib_mad_completion_handler);
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_add_tail(&port_priv->port_list, &ib_mad_port_list);
@@ -3238,7 +3212,7 @@ error7:
error6:
ib_dealloc_pd(port_priv->pd);
error4:
- ib_destroy_cq(port_priv->cq);
+ ib_free_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
error3:
@@ -3271,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
ib_dealloc_pd(port_priv->pd);
- ib_destroy_cq(port_priv->cq);
+ ib_free_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
/* XXX: Handle deallocation of MAD registration tables */
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 990698a..28669f6 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -64,6 +64,7 @@
struct ib_mad_list_head {
struct list_head list;
+ struct ib_cqe cqe;
struct ib_mad_queue *mad_queue;
};
@@ -204,7 +205,6 @@ struct ib_mad_port_private {
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
struct list_head agent_list;
struct workqueue_struct *wq;
- struct work_struct work;
struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
};
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index bb6685f..250937c 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
struct ib_ah_attr *ah_attr)
{
int ret;
u16 gid_index;
u8 p;
- ret = ib_find_cached_gid(device, &rec->port_gid,
- NULL, &p, &gid_index);
+ if (rdma_protocol_roce(device, port_num)) {
+ ret = ib_find_cached_gid_by_port(device, &rec->port_gid,
+ gid_type, port_num,
+ ndev,
+ &gid_index);
+ } else if (rdma_protocol_ib(device, port_num)) {
+ ret = ib_find_cached_gid(device, &rec->port_gid,
+ IB_GID_TYPE_IB, NULL, &p,
+ &gid_index);
+ } else {
+ ret = -EINVAL;
+ }
+
if (ret)
return ret;
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 178f984..06556c3 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -67,17 +67,53 @@ struct netdev_event_work {
struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
};
+static const struct {
+ bool (*is_supported)(const struct ib_device *device, u8 port_num);
+ enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+ {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
+ {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
+};
+
+#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
+{
+ int i;
+ unsigned int ret_flags = 0;
+
+ if (!rdma_protocol_roce(ib_dev, port))
+ return 1UL << IB_GID_TYPE_IB;
+
+ for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+ if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
+ ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+ return ret_flags;
+}
+EXPORT_SYMBOL(roce_gid_type_mask_support);
+
static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
u8 port, union ib_gid *gid,
struct ib_gid_attr *gid_attr)
{
- switch (gid_op) {
- case GID_ADD:
- ib_cache_gid_add(ib_dev, port, gid, gid_attr);
- break;
- case GID_DEL:
- ib_cache_gid_del(ib_dev, port, gid, gid_attr);
- break;
+ int i;
+ unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
+ if ((1UL << i) & gid_type_mask) {
+ gid_attr->gid_type = i;
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port,
+ gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port,
+ gid, gid_attr);
+ break;
+ }
+ }
}
}
@@ -103,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de
return BONDING_SLAVE_STATE_NA;
}
-static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
-{
- struct net_device *_upper = NULL;
- struct list_head *iter;
-
- netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
- if (_upper == upper)
- break;
-
- return _upper == upper;
-}
-
#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
BONDING_SLAVE_STATE_NA)
static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
@@ -132,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
if (!real_dev)
real_dev = event_ndev;
- res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) &&
(is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
REQUIRED_BOND_STATES)) ||
real_dev == rdma_ndev);
@@ -178,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port,
return 1;
rcu_read_lock();
- res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+ res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev);
rcu_read_unlock();
return res;
@@ -203,10 +227,12 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
u8 port, struct net_device *event_ndev,
struct net_device *rdma_ndev)
{
+ unsigned long gid_type_mask;
+
rcu_read_lock();
if (!rdma_ndev ||
((rdma_ndev != event_ndev &&
- !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
is_eth_active_slave_of_bonding_rcu(rdma_ndev,
netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
BONDING_SLAVE_STATE_INACTIVE)) {
@@ -215,7 +241,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
}
rcu_read_unlock();
- ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask,
IB_CACHE_GID_DEFAULT_MODE_SET);
}
@@ -234,12 +262,17 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
rcu_read_lock();
- if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) &&
is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
BONDING_SLAVE_STATE_INACTIVE) {
+ unsigned long gid_type_mask;
+
rcu_read_unlock();
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ gid_type_mask,
IB_CACHE_GID_DEFAULT_MODE_DELETE);
} else {
rcu_read_unlock();
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index a95a32b..f334090 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -49,7 +49,9 @@
#include <net/netlink.h>
#include <uapi/rdma/ib_user_sa.h>
#include <rdma/ib_marshall.h>
+#include <rdma/ib_addr.h>
#include "sa.h"
+#include "core_priv.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand subnet administration query support");
@@ -715,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb,
struct nlattr *tb[LS_NLA_TYPE_MAX];
int ret;
- if (!netlink_capable(skb, CAP_NET_ADMIN))
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk) ||
+ !netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
@@ -789,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
int found = 0;
int ret;
- if (!netlink_capable(skb, CAP_NET_ADMIN))
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk) ||
+ !netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
spin_lock_irqsave(&ib_nl_request_lock, flags);
@@ -996,7 +1002,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
{
int ret;
u16 gid_index;
- int force_grh;
+ int use_roce;
+ struct net_device *ndev = NULL;
memset(ah_attr, 0, sizeof *ah_attr);
ah_attr->dlid = be16_to_cpu(rec->dlid);
@@ -1006,16 +1013,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
ah_attr->port_num = port_num;
ah_attr->static_rate = rec->rate;
- force_grh = rdma_cap_eth_ah(device, port_num);
+ use_roce = rdma_cap_eth_ah(device, port_num);
+
+ if (use_roce) {
+ struct net_device *idev;
+ struct net_device *resolved_dev;
+ struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex,
+ .net = rec->net ? rec->net :
+ &init_net};
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid);
+ rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid);
+
+ /* validate the route */
+ ret = rdma_resolve_ip_route(&sgid_addr._sockaddr,
+ &dgid_addr._sockaddr, &dev_addr);
+ if (ret)
+ return ret;
- if (rec->hop_limit > 1 || force_grh) {
- struct net_device *ndev = ib_get_ndev_from_path(rec);
+ if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+ dev_addr.network == RDMA_NETWORK_IPV6) &&
+ rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+ return -EINVAL;
+
+ idev = device->get_netdev(device, port_num);
+ if (!idev)
+ return -ENODEV;
+
+ resolved_dev = dev_get_by_index(dev_addr.net,
+ dev_addr.bound_dev_if);
+ if (resolved_dev->flags & IFF_LOOPBACK) {
+ dev_put(resolved_dev);
+ resolved_dev = idev;
+ dev_hold(resolved_dev);
+ }
+ ndev = ib_get_ndev_from_path(rec);
+ rcu_read_lock();
+ if ((ndev && ndev != resolved_dev) ||
+ (resolved_dev != idev &&
+ !rdma_is_upper_dev_rcu(idev, resolved_dev)))
+ ret = -EHOSTUNREACH;
+ rcu_read_unlock();
+ dev_put(idev);
+ dev_put(resolved_dev);
+ if (ret) {
+ if (ndev)
+ dev_put(ndev);
+ return ret;
+ }
+ }
+ if (rec->hop_limit > 1 || use_roce) {
ah_attr->ah_flags = IB_AH_GRH;
ah_attr->grh.dgid = rec->dgid;
- ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
- &gid_index);
+ ret = ib_find_cached_gid_by_port(device, &rec->sgid,
+ rec->gid_type, port_num, ndev,
+ &gid_index);
if (ret) {
if (ndev)
dev_put(ndev);
@@ -1029,9 +1091,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
if (ndev)
dev_put(ndev);
}
- if (force_grh) {
+
+ if (use_roce)
memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
- }
+
return 0;
}
EXPORT_SYMBOL(ib_init_ah_from_path);
@@ -1157,6 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
mad->data, &rec);
rec.net = NULL;
rec.ifindex = 0;
+ rec.gid_type = IB_GID_TYPE_IB;
memset(rec.dmac, 0, ETH_ALEN);
query->callback(status, &rec, query->context);
} else
@@ -1609,14 +1673,15 @@ static void send_handler(struct ib_mad_agent *agent,
}
static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_sa_query *query;
- struct ib_mad_send_buf *mad_buf;
- mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
- query = mad_buf->context[0];
+ if (!send_buf)
+ return;
+ query = send_buf->context[0];
if (query->callback) {
if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
query->callback(query,
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index b1f37d4..3de9351 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -37,15 +37,27 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
+#include <linux/netdevice.h>
#include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
+struct ib_port;
+
+struct gid_attr_group {
+ struct ib_port *port;
+ struct kobject kobj;
+ struct attribute_group ndev;
+ struct attribute_group type;
+};
struct ib_port {
struct kobject kobj;
struct ib_device *ibdev;
+ struct gid_attr_group *gid_attr_group;
struct attribute_group gid_group;
struct attribute_group pkey_group;
u8 port_num;
+ struct attribute_group *pma_table;
};
struct port_attribute {
@@ -65,6 +77,7 @@ struct port_table_attribute {
struct port_attribute attr;
char name[8];
int index;
+ __be16 attr_id;
};
static ssize_t port_attr_show(struct kobject *kobj,
@@ -84,6 +97,24 @@ static const struct sysfs_ops port_sysfs_ops = {
.show = port_attr_show
};
+static ssize_t gid_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct gid_attr_group,
+ kobj)->port;
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+ .show = gid_attr_show
+};
+
static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
char *buf)
{
@@ -281,6 +312,46 @@ static struct attribute *port_default_attrs[] = {
NULL
};
+static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf)
+{
+ if (!gid_attr->ndev)
+ return -EINVAL;
+
+ return sprintf(buf, "%s\n", gid_attr->ndev->name);
+}
+
+static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf)
+{
+ return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(struct ib_port *p,
+ struct port_attribute *attr,
+ char *buf,
+ size_t (*print)(struct ib_gid_attr *gid_attr,
+ char *buf))
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr = {};
+ ssize_t ret;
+ va_list args;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid,
+ &gid_attr);
+ if (ret)
+ goto err;
+
+ ret = print(&gid_attr, buf);
+
+err:
+ if (gid_attr.ndev)
+ dev_put(gid_attr.ndev);
+ va_end(args);
+ return ret;
+}
+
static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
char *buf)
{
@@ -296,6 +367,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
return sprintf(buf, "%pI6\n", gid.raw);
}
+static ssize_t show_port_gid_attr_ndev(struct ib_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_port *p,
+ struct port_attribute *attr,
+ char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_gid_type);
+}
+
static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
char *buf)
{
@@ -314,24 +398,32 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
struct port_table_attribute port_pma_attr_##_name = { \
.attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
- .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \
+ .attr_id = IB_PMA_PORT_COUNTERS , \
}
-static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
- char *buf)
+#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \
+struct port_table_attribute port_pma_attr_ext_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16), \
+ .attr_id = IB_PMA_PORT_COUNTERS_EXT , \
+}
+
+/*
+ * Get a Perfmgmt MAD block of data.
+ * Returns error code or the number of bytes retrieved.
+ */
+static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
+ void *data, int offset, size_t size)
{
- struct port_table_attribute *tab_attr =
- container_of(attr, struct port_table_attribute, attr);
- int offset = tab_attr->index & 0xffff;
- int width = (tab_attr->index >> 16) & 0xff;
- struct ib_mad *in_mad = NULL;
- struct ib_mad *out_mad = NULL;
+ struct ib_mad *in_mad;
+ struct ib_mad *out_mad;
size_t mad_size = sizeof(*out_mad);
u16 out_mad_pkey_index = 0;
ssize_t ret;
- if (!p->ibdev->process_mad)
- return sprintf(buf, "N/A (no PMA)\n");
+ if (!dev->process_mad)
+ return -ENOSYS;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -344,12 +436,13 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
in_mad->mad_hdr.class_version = 1;
in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
- in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */
+ in_mad->mad_hdr.attr_id = attr;
- in_mad->data[41] = p->port_num; /* PortSelect field */
+ if (attr != IB_PMA_CLASS_PORT_INFO)
+ in_mad->data[41] = port_num; /* PortSelect field */
- if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
- p->port_num, NULL, NULL,
+ if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
+ port_num, NULL, NULL,
(const struct ib_mad_hdr *)in_mad, mad_size,
(struct ib_mad_hdr *)out_mad, &mad_size,
&out_mad_pkey_index) &
@@ -358,31 +451,54 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
ret = -EINVAL;
goto out;
}
+ memcpy(data, out_mad->data + offset, size);
+ ret = size;
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return ret;
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ ssize_t ret;
+ u8 data[8];
+
+ ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data,
+ 40 + offset / 8, sizeof(data));
+ if (ret < 0)
+ return sprintf(buf, "N/A (no PMA)\n");
switch (width) {
case 4:
- ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+ ret = sprintf(buf, "%u\n", (*data >>
(4 - (offset % 8))) & 0xf);
break;
case 8:
- ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+ ret = sprintf(buf, "%u\n", *data);
break;
case 16:
ret = sprintf(buf, "%u\n",
- be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+ be16_to_cpup((__be16 *)data));
break;
case 32:
ret = sprintf(buf, "%u\n",
- be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+ be32_to_cpup((__be32 *)data));
+ break;
+ case 64:
+ ret = sprintf(buf, "%llu\n",
+ be64_to_cpup((__be64 *)data));
break;
+
default:
ret = 0;
}
-out:
- kfree(in_mad);
- kfree(out_mad);
-
return ret;
}
@@ -403,6 +519,18 @@ static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+/*
+ * Counters added by extended set
+ */
+static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64);
+static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256);
+static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320);
+static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384);
+static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448);
+static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512);
+
static struct attribute *pma_attrs[] = {
&port_pma_attr_symbol_error.attr.attr,
&port_pma_attr_link_error_recovery.attr.attr,
@@ -423,11 +551,65 @@ static struct attribute *pma_attrs[] = {
NULL
};
+static struct attribute *pma_attrs_ext[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_xmit_packets.attr.attr,
+ &port_pma_attr_ext_multicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_multicast_xmit_packets.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_noietf[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ NULL
+};
+
static struct attribute_group pma_group = {
.name = "counters",
.attrs = pma_attrs
};
+static struct attribute_group pma_group_ext = {
+ .name = "counters",
+ .attrs = pma_attrs_ext
+};
+
+static struct attribute_group pma_group_noietf = {
+ .name = "counters",
+ .attrs = pma_attrs_noietf
+};
+
static void ib_port_release(struct kobject *kobj)
{
struct ib_port *p = container_of(kobj, struct ib_port, kobj);
@@ -451,12 +633,41 @@ static void ib_port_release(struct kobject *kobj)
kfree(p);
}
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+ struct gid_attr_group *g = container_of(kobj, struct gid_attr_group,
+ kobj);
+ struct attribute *a;
+ int i;
+
+ if (g->ndev.attrs) {
+ for (i = 0; (a = g->ndev.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->ndev.attrs);
+ }
+
+ if (g->type.attrs) {
+ for (i = 0; (a = g->type.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->type.attrs);
+ }
+
+ kfree(g);
+}
+
static struct kobj_type port_type = {
.release = ib_port_release,
.sysfs_ops = &port_sysfs_ops,
.default_attrs = port_default_attrs
};
+static struct kobj_type gid_attr_type = {
+ .sysfs_ops = &gid_attr_sysfs_ops,
+ .release = ib_port_gid_attr_release
+};
+
static struct attribute **
alloc_group_attrs(ssize_t (*show)(struct ib_port *,
struct port_attribute *, char *buf),
@@ -500,6 +711,31 @@ err:
return NULL;
}
+/*
+ * Figure out which counter table to use depending on
+ * the device capabilities.
+ */
+static struct attribute_group *get_counter_table(struct ib_device *dev,
+ int port_num)
+{
+ struct ib_class_port_info cpi;
+
+ if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
+ &cpi, 40, sizeof(cpi)) >= 0) {
+
+ if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH)
+ /* We have extended counters */
+ return &pma_group_ext;
+
+ if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+ /* But not the IETF ones */
+ return &pma_group_noietf;
+ }
+
+ /* Fall back to normal counters */
+ return &pma_group;
+}
+
static int add_port(struct ib_device *device, int port_num,
int (*port_callback)(struct ib_device *,
u8, struct kobject *))
@@ -528,9 +764,24 @@ static int add_port(struct ib_device *device, int port_num,
return ret;
}
- ret = sysfs_create_group(&p->kobj, &pma_group);
- if (ret)
+ p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
+ if (!p->gid_attr_group) {
+ ret = -ENOMEM;
goto err_put;
+ }
+
+ p->gid_attr_group->port = p;
+ ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
+ &p->kobj, "gid_attrs");
+ if (ret) {
+ kfree(p->gid_attr_group);
+ goto err_put;
+ }
+
+ p->pma_table = get_counter_table(device, port_num);
+ ret = sysfs_create_group(&p->kobj, p->pma_table);
+ if (ret)
+ goto err_put_gid_attrs;
p->gid_group.name = "gids";
p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
@@ -543,12 +794,38 @@ static int add_port(struct ib_device *device, int port_num,
if (ret)
goto err_free_gid;
+ p->gid_attr_group->ndev.name = "ndevs";
+ p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->ndev.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+ if (ret)
+ goto err_free_gid_ndev;
+
+ p->gid_attr_group->type.name = "types";
+ p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->type.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid_ndev;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+ if (ret)
+ goto err_free_gid_type;
+
p->pkey_group.name = "pkeys";
p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
attr.pkey_tbl_len);
if (!p->pkey_group.attrs) {
ret = -ENOMEM;
- goto err_remove_gid;
+ goto err_remove_gid_type;
}
ret = sysfs_create_group(&p->kobj, &p->pkey_group);
@@ -576,6 +853,28 @@ err_free_pkey:
kfree(p->pkey_group.attrs);
p->pkey_group.attrs = NULL;
+err_remove_gid_type:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+
+err_free_gid_type:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->type.attrs[i]);
+
+ kfree(p->gid_attr_group->type.attrs);
+ p->gid_attr_group->type.attrs = NULL;
+
+err_remove_gid_ndev:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+
+err_free_gid_ndev:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->ndev.attrs[i]);
+
+ kfree(p->gid_attr_group->ndev.attrs);
+ p->gid_attr_group->ndev.attrs = NULL;
+
err_remove_gid:
sysfs_remove_group(&p->kobj, &p->gid_group);
@@ -587,7 +886,10 @@ err_free_gid:
p->gid_group.attrs = NULL;
err_remove_pma:
- sysfs_remove_group(&p->kobj, &pma_group);
+ sysfs_remove_group(&p->kobj, p->pma_table);
+
+err_put_gid_attrs:
+ kobject_put(&p->gid_attr_group->kobj);
err_put:
kobject_put(&p->kobj);
@@ -614,18 +916,12 @@ static ssize_t show_sys_image_guid(struct device *device,
struct device_attribute *dev_attr, char *buf)
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
- struct ib_device_attr attr;
- ssize_t ret;
-
- ret = ib_query_device(dev, &attr);
- if (ret)
- return ret;
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
}
static ssize_t show_node_guid(struct device *device,
@@ -800,9 +1096,14 @@ static void free_port_list_attributes(struct ib_device *device)
list_for_each_entry_safe(p, t, &device->port_list, entry) {
struct ib_port *port = container_of(p, struct ib_port, kobj);
list_del(&p->entry);
- sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, port->pma_table);
sysfs_remove_group(p, &port->pkey_group);
sysfs_remove_group(p, &port->gid_group);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->ndev);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->type);
+ kobject_put(&port->gid_attr_group->kobj);
kobject_put(p);
}
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 72feee6..19837d2 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -35,6 +35,7 @@
#include <linux/string.h>
#include <linux/export.h>
#include <linux/if_ether.h>
+#include <linux/ip.h>
#include <rdma/ib_pack.h>
@@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = {
.size_bits = 16 }
};
+static const struct ib_field ip4_table[] = {
+ { STRUCT_FIELD(ip4, ver),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, hdr_len),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, tos),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, tot_len),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, id),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, frag_off),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, ttl),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, protocol),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, check),
+ .offset_words = 2,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, saddr),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(ip4, daddr),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 32 }
+};
+
+static const struct ib_field udp_table[] = {
+ { STRUCT_FIELD(udp, sport),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, dport),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, csum),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
static const struct ib_field grh_table[] = {
{ STRUCT_FIELD(grh, ip_version),
.offset_words = 0,
@@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = {
.size_bits = 24 }
};
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+ struct iphdr iph;
+
+ iph.ihl = 5;
+ iph.version = 4;
+ iph.tos = header->ip4.tos;
+ iph.tot_len = header->ip4.tot_len;
+ iph.id = header->ip4.id;
+ iph.frag_off = header->ip4.frag_off;
+ iph.ttl = header->ip4.ttl;
+ iph.protocol = header->ip4.protocol;
+ iph.check = 0;
+ iph.saddr = header->ip4.saddr;
+ iph.daddr = header->ip4.daddr;
+
+ return ip_fast_csum((u8 *)&iph, iph.ihl);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
/**
* ib_ud_header_init - Initialize UD header structure
* @payload_bytes:Length of packet payload
* @lrh_present: specify if LRH is present
* @eth_present: specify if Eth header is present
* @vlan_present: packet is tagged vlan
- * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @ip_version: if non-zero, IP header, V4 or V6, will be included
+ * @udp_present :if non-zero, UDP header will be included
* @immediate_present: specify if immediate data is present
* @header:Structure to initialize
*/
-void ib_ud_header_init(int payload_bytes,
- int lrh_present,
- int eth_present,
- int vlan_present,
- int grh_present,
- int immediate_present,
- struct ib_ud_header *header)
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header)
{
+ grh_present = grh_present && !ip_version;
memset(header, 0, sizeof *header);
+ /*
+ * UDP header without IP header doesn't make sense
+ */
+ if (udp_present && ip_version != 4 && ip_version != 6)
+ return -EINVAL;
+
if (lrh_present) {
u16 packet_length;
@@ -252,7 +350,7 @@ void ib_ud_header_init(int payload_bytes,
if (vlan_present)
header->eth.type = cpu_to_be16(ETH_P_8021Q);
- if (grh_present) {
+ if (ip_version == 6 || grh_present) {
header->grh.ip_version = 6;
header->grh.payload_length =
cpu_to_be16((IB_BTH_BYTES +
@@ -260,8 +358,30 @@ void ib_ud_header_init(int payload_bytes,
payload_bytes +
4 + /* ICRC */
3) & ~3); /* round up */
- header->grh.next_header = 0x1b;
+ header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b;
+ }
+
+ if (ip_version == 4) {
+ int udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+ header->ip4.ver = 4; /* version 4 */
+ header->ip4.hdr_len = 5; /* 5 words */
+ header->ip4.tot_len =
+ cpu_to_be16(IB_IP4_BYTES +
+ udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
+ header->ip4.protocol = IPPROTO_UDP;
}
+ if (udp_present && ip_version)
+ header->udp.length =
+ cpu_to_be16(IB_UDP_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
if (immediate_present)
header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
@@ -273,8 +393,11 @@ void ib_ud_header_init(int payload_bytes,
header->lrh_present = lrh_present;
header->eth_present = eth_present;
header->vlan_present = vlan_present;
- header->grh_present = grh_present;
+ header->grh_present = grh_present || (ip_version == 6);
+ header->ipv4_present = ip_version == 4;
+ header->udp_present = udp_present;
header->immediate_present = immediate_present;
+ return 0;
}
EXPORT_SYMBOL(ib_ud_header_init);
@@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header,
&header->grh, buf + len);
len += IB_GRH_BYTES;
}
+ if (header->ipv4_present) {
+ ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+ &header->ip4, buf + len);
+ len += IB_IP4_BYTES;
+ }
+ if (header->udp_present) {
+ ib_pack(udp_table, ARRAY_SIZE(udp_table),
+ &header->udp, buf + len);
+ len += IB_UDP_BYTES;
+ }
ib_pack(bth_table, ARRAY_SIZE(bth_table),
&header->bth, buf + len);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 40becdb..e69bf26 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
ib_ucontext_notifier_end_account(context);
}
-static struct mmu_notifier_ops ib_umem_notifiers = {
+static const struct mmu_notifier_ops ib_umem_notifiers = {
.release = ib_umem_notifier_release,
.invalidate_page = ib_umem_notifier_invalidate_page,
.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 57f281f..415a318 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent,
}
static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_umad_file *file = agent->context;
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 94bbd8c..612ccfd 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+int uverbs_dealloc_mw(struct ib_mw *mw);
+
struct ib_uverbs_flow_spec {
union {
union {
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 1c02dea..6ffc9c4 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- struct ib_device_attr dev_attr;
-#endif
struct ib_ucontext *ucontext;
struct file *filp;
int ret;
@@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
- ret = ib_query_device(ib_dev, &dev_attr);
- if (ret)
- goto err_free;
- if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
ucontext->invalidate_range = NULL;
#endif
@@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
{
struct ib_uverbs_query_device cmd;
struct ib_uverbs_query_device_resp resp;
- struct ib_device_attr attr;
- int ret;
if (out_len < sizeof resp)
return -ENOSPC;
@@ -456,12 +448,8 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_device(ib_dev, &attr);
- if (ret)
- return ret;
-
memset(&resp, 0, sizeof resp);
- copy_query_dev_fields(file, ib_dev, &resp, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
}
if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
- struct ib_device_attr attr;
-
- ret = ib_query_device(pd->device, &attr);
- if (ret || !(attr.device_cap_flags &
- IB_DEVICE_ON_DEMAND_PAGING)) {
+ if (!(pd->device->attrs.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
ret = -EINVAL;
goto err_put;
@@ -1008,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
mr->pd = pd;
mr->uobject = uobj;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
uobj->object = mr;
ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
@@ -1106,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
}
}
- if (atomic_read(&mr->usecnt)) {
- ret = -EBUSY;
- goto put_uobj_pd;
- }
-
old_pd = mr->pd;
ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
cmd.length, cmd.hca_va,
@@ -1258,7 +1237,7 @@ err_copy:
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
err_unalloc:
- ib_dealloc_mw(mw);
+ uverbs_dealloc_mw(mw);
err_put:
put_pd_read(pd);
@@ -1287,7 +1266,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
mw = uobj->object;
- ret = ib_dealloc_mw(mw);
+ ret = uverbs_dealloc_mw(mw);
if (!ret)
uobj->live = 0;
@@ -1845,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file,
sizeof(cmd->create_flags))
attr.create_flags = cmd->create_flags;
- if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+ if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+ IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV)) {
ret = -EINVAL;
goto err_put;
}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3ef288..39680ae 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
static void ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd = mw->pd;
+ int ret;
+
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+ return ret;
+}
+
static void ib_uverbs_release_dev(struct kobject *kobj)
{
struct ib_uverbs_device *dev =
@@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
struct ib_mw *mw = uobj->object;
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
- ib_dealloc_mw(mw);
+ uverbs_dealloc_mw(mw);
kfree(uobj);
}
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 7d2f14c..af020f8 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
memset(dst->dmac, 0, sizeof(dst->dmac));
dst->net = NULL;
dst->ifindex = 0;
+ dst->gid_type = IB_GID_TYPE_IB;
}
EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 545906d..5af6d02 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
struct ib_pd *ib_alloc_pd(struct ib_device *device)
{
struct ib_pd *pd;
- struct ib_device_attr devattr;
- int rc;
-
- rc = ib_query_device(device, &devattr);
- if (rc)
- return ERR_PTR(rc);
pd = device->alloc_pd(device, NULL, NULL);
if (IS_ERR(pd))
@@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device)
pd->local_mr = NULL;
atomic_set(&pd->usecnt, 0);
- if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
pd->local_dma_lkey = device->local_dma_lkey;
else {
struct ib_mr *mr;
@@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
}
EXPORT_SYMBOL(ib_create_ah);
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+ const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+ struct iphdr ip4h_checked;
+ const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+ /* If it's IPv6, the version must be 6, otherwise, the first
+ * 20 bytes (before the IPv4 header) are garbled.
+ */
+ if (ip6h->version != 6)
+ return (ip4h->version == 4) ? 4 : 0;
+ /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+ /* RoCE v2 requires no options, thus header length
+ * must be 5 words
+ */
+ if (ip4h->ihl != 5)
+ return 6;
+
+ /* Verify checksum.
+ * We can't write on scattered buffers so we need to copy to
+ * temp buffer.
+ */
+ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+ ip4h_checked.check = 0;
+ ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+ /* if IPv4 header checksum is OK, believe it */
+ if (ip4h->check == ip4h_checked.check)
+ return 4;
+ return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+ u8 port_num,
+ const struct ib_grh *grh)
+{
+ int grh_version;
+
+ if (rdma_protocol_ib(device, port_num))
+ return RDMA_NETWORK_IB;
+
+ grh_version = ib_get_header_version((union rdma_network_hdr *)grh);
+
+ if (grh_version == 4)
+ return RDMA_NETWORK_IPV4;
+
+ if (grh->next_hdr == IPPROTO_UDP)
+ return RDMA_NETWORK_IPV6;
+
+ return RDMA_NETWORK_ROCE_V1;
+}
+
struct find_gid_index_context {
u16 vlan_id;
+ enum ib_gid_type gid_type;
};
static bool find_gid_index(const union ib_gid *gid,
@@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid,
struct find_gid_index_context *ctx =
(struct find_gid_index_context *)context;
+ if (ctx->gid_type != gid_attr->gid_type)
+ return false;
+
if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
(is_vlan_dev(gid_attr->ndev) &&
vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid,
static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
u16 vlan_id, const union ib_gid *sgid,
+ enum ib_gid_type gid_type,
u16 *gid_index)
{
- struct find_gid_index_context context = {.vlan_id = vlan_id};
+ struct find_gid_index_context context = {.vlan_id = vlan_id,
+ .gid_type = gid_type};
return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
&context, gid_index);
}
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid)
+{
+ struct sockaddr_in src_in;
+ struct sockaddr_in dst_in;
+ __be32 src_saddr, dst_saddr;
+
+ if (!sgid || !dgid)
+ return -EINVAL;
+
+ if (net_type == RDMA_NETWORK_IPV4) {
+ memcpy(&src_in.sin_addr.s_addr,
+ &hdr->roce4grh.saddr, 4);
+ memcpy(&dst_in.sin_addr.s_addr,
+ &hdr->roce4grh.daddr, 4);
+ src_saddr = src_in.sin_addr.s_addr;
+ dst_saddr = dst_in.sin_addr.s_addr;
+ ipv6_addr_set_v4mapped(src_saddr,
+ (struct in6_addr *)sgid);
+ ipv6_addr_set_v4mapped(dst_saddr,
+ (struct in6_addr *)dgid);
+ return 0;
+ } else if (net_type == RDMA_NETWORK_IPV6 ||
+ net_type == RDMA_NETWORK_IB) {
+ *dgid = hdr->ibgrh.dgid;
+ *sgid = hdr->ibgrh.sgid;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
const struct ib_wc *wc, const struct ib_grh *grh,
struct ib_ah_attr *ah_attr)
@@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
u32 flow_class;
u16 gid_index;
int ret;
+ enum rdma_network_type net_type = RDMA_NETWORK_IB;
+ enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ int hoplimit = 0xff;
+ union ib_gid dgid;
+ union ib_gid sgid;
memset(ah_attr, 0, sizeof *ah_attr);
if (rdma_cap_eth_ah(device, port_num)) {
+ if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+ net_type = wc->network_hdr_type;
+ else
+ net_type = ib_get_net_type_by_grh(device, port_num, grh);
+ gid_type = ib_network_to_gid_type(net_type);
+ }
+ ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+ &sgid, &dgid);
+ if (ret)
+ return ret;
+
+ if (rdma_protocol_roce(device, port_num)) {
+ int if_index = 0;
u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
wc->vlan_id : 0xffff;
+ struct net_device *idev;
+ struct net_device *resolved_dev;
if (!(wc->wc_flags & IB_WC_GRH))
return -EPROTOTYPE;
- if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
- !(wc->wc_flags & IB_WC_WITH_VLAN)) {
- ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
- ah_attr->dmac,
- wc->wc_flags & IB_WC_WITH_VLAN ?
- NULL : &vlan_id,
- 0);
- if (ret)
- return ret;
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ idev = device->get_netdev(device, port_num);
+ if (!idev)
+ return -ENODEV;
+
+ ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
+ ah_attr->dmac,
+ wc->wc_flags & IB_WC_WITH_VLAN ?
+ NULL : &vlan_id,
+ &if_index, &hoplimit);
+ if (ret) {
+ dev_put(idev);
+ return ret;
}
- ret = get_sgid_index_from_eth(device, port_num, vlan_id,
- &grh->dgid, &gid_index);
+ resolved_dev = dev_get_by_index(&init_net, if_index);
+ if (resolved_dev->flags & IFF_LOOPBACK) {
+ dev_put(resolved_dev);
+ resolved_dev = idev;
+ dev_hold(resolved_dev);
+ }
+ rcu_read_lock();
+ if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
+ resolved_dev))
+ ret = -EHOSTUNREACH;
+ rcu_read_unlock();
+ dev_put(idev);
+ dev_put(resolved_dev);
if (ret)
return ret;
- if (wc->wc_flags & IB_WC_WITH_SMAC)
- memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+ ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+ &dgid, gid_type, &gid_index);
+ if (ret)
+ return ret;
}
ah_attr->dlid = wc->slid;
@@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
if (wc->wc_flags & IB_WC_GRH) {
ah_attr->ah_flags = IB_AH_GRH;
- ah_attr->grh.dgid = grh->sgid;
+ ah_attr->grh.dgid = sgid;
if (!rdma_cap_eth_ah(device, port_num)) {
- ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+ ret = ib_find_cached_gid_by_port(device, &dgid,
+ IB_GID_TYPE_IB,
port_num, NULL,
&gid_index);
if (ret)
@@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
ah_attr->grh.sgid_index = (u8) gid_index;
flow_class = be32_to_cpu(grh->version_tclass_flow);
ah_attr->grh.flow_label = flow_class & 0xFFFFF;
- ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.hop_limit = hoplimit;
ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
}
return 0;
@@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
union ib_gid sgid;
struct ib_gid_attr sgid_attr;
int ifindex;
+ int hop_limit;
ret = ib_query_gid(qp->device,
qp_attr->ah_attr.port_num,
@@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
ifindex = sgid_attr.ndev->ifindex;
- ret = rdma_addr_find_dmac_by_grh(&sgid,
- &qp_attr->ah_attr.grh.dgid,
- qp_attr->ah_attr.dmac,
- NULL, ifindex);
+ ret = rdma_addr_find_l2_eth_by_grh(&sgid,
+ &qp_attr->ah_attr.grh.dgid,
+ qp_attr->ah_attr.dmac,
+ NULL, &ifindex, &hop_limit);
dev_put(sgid_attr.ndev);
+
+ qp_attr->ah_attr.grh.hop_limit = hop_limit;
}
}
out:
@@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
}
return mr;
}
EXPORT_SYMBOL(ib_get_dma_mr);
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
- return mr->device->query_mr ?
- mr->device->query_mr(mr, mr_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_mr);
-
int ib_dereg_mr(struct ib_mr *mr)
{
- struct ib_pd *pd;
+ struct ib_pd *pd = mr->pd;
int ret;
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
ret = mr->device->dereg_mr(mr);
if (!ret)
atomic_dec(&pd->usecnt);
@@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
}
return mr;
}
EXPORT_SYMBOL(ib_alloc_mr);
-/* Memory windows */
-
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
- struct ib_mw *mw;
-
- if (!pd->device->alloc_mw)
- return ERR_PTR(-ENOSYS);
-
- mw = pd->device->alloc_mw(pd, type);
- if (!IS_ERR(mw)) {
- mw->device = pd->device;
- mw->pd = pd;
- mw->uobject = NULL;
- mw->type = type;
- atomic_inc(&pd->usecnt);
- }
-
- return mw;
-}
-EXPORT_SYMBOL(ib_alloc_mw);
-
-int ib_dealloc_mw(struct ib_mw *mw)
-{
- struct ib_pd *pd;
- int ret;
-
- pd = mw->pd;
- ret = mw->device->dealloc_mw(mw);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_mw);
-
/* "Fast" memory regions */
struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr,
int (*set_page)(struct ib_mr *, u64))
{
struct scatterlist *sg;
- u64 last_end_dma_addr = 0, last_page_addr = 0;
+ u64 last_end_dma_addr = 0;
unsigned int last_page_off = 0;
u64 page_mask = ~((u64)mr->page_size - 1);
int i, ret;
@@ -1572,7 +1651,6 @@ next_page:
mr->length += dma_len;
last_end_dma_addr = end_dma_addr;
- last_page_addr = end_dma_addr & page_mask;
last_page_off = end_dma_addr & ~page_mask;
}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index cb78b1e..f504ba7 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -149,7 +149,7 @@ static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_en
error = l2t_send(tdev, skb, l2e);
if (error < 0)
kfree_skb(skb);
- return error;
+ return error < 0 ? error : 0;
}
int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
@@ -165,7 +165,7 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
error = cxgb3_ofld_send(tdev, skb);
if (error < 0)
kfree_skb(skb);
- return error;
+ return error < 0 ? error : 0;
}
static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
index cfe4049..97fbfd2 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cq.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -115,10 +115,6 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
case T3_SEND_WITH_SE_INV:
wc->opcode = IB_WC_SEND;
break;
- case T3_BIND_MW:
- wc->opcode = IB_WC_BIND_MW;
- break;
-
case T3_LOCAL_INV:
wc->opcode = IB_WC_LOCAL_INV;
break;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
index 5c36ee2..1d04c87 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_mem.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c
@@ -75,37 +75,6 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
return ret;
}
-int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
- struct iwch_mr *mhp,
- int shift,
- int npages)
-{
- u32 stag;
- int ret;
-
- /* We could support this... */
- if (npages > mhp->attr.pbl_size)
- return -ENOMEM;
-
- stag = mhp->attr.stag;
- if (cxio_reregister_phys_mem(&rhp->rdev,
- &stag, mhp->attr.pdid,
- mhp->attr.perms,
- mhp->attr.zbva,
- mhp->attr.va_fbo,
- mhp->attr.len,
- shift - 12,
- mhp->attr.pbl_size, mhp->attr.pbl_addr))
- return -ENOMEM;
-
- ret = iwch_finish_mem_reg(mhp, stag);
- if (ret)
- cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
-
- return ret;
-}
-
int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
{
mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
@@ -130,74 +99,3 @@ int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
return cxio_write_pbl(&mhp->rhp->rdev, pages,
mhp->attr.pbl_addr + (offset << 3), npages);
}
-
-int build_phys_page_list(struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- u64 *iova_start,
- u64 *total_size,
- int *npages,
- int *shift,
- __be64 **page_list)
-{
- u64 mask;
- int i, j, n;
-
- mask = 0;
- *total_size = 0;
- for (i = 0; i < num_phys_buf; ++i) {
- if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
- return -EINVAL;
- if (i != 0 && i != num_phys_buf - 1 &&
- (buffer_list[i].size & ~PAGE_MASK))
- return -EINVAL;
- *total_size += buffer_list[i].size;
- if (i > 0)
- mask |= buffer_list[i].addr;
- else
- mask |= buffer_list[i].addr & PAGE_MASK;
- if (i != num_phys_buf - 1)
- mask |= buffer_list[i].addr + buffer_list[i].size;
- else
- mask |= (buffer_list[i].addr + buffer_list[i].size +
- PAGE_SIZE - 1) & PAGE_MASK;
- }
-
- if (*total_size > 0xFFFFFFFFULL)
- return -ENOMEM;
-
- /* Find largest page shift we can use to cover buffers */
- for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
- if ((1ULL << *shift) & mask)
- break;
-
- buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
- buffer_list[0].addr &= ~0ull << *shift;
-
- *npages = 0;
- for (i = 0; i < num_phys_buf; ++i)
- *npages += (buffer_list[i].size +
- (1ULL << *shift) - 1) >> *shift;
-
- if (!*npages)
- return -EINVAL;
-
- *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
- if (!*page_list)
- return -ENOMEM;
-
- n = 0;
- for (i = 0; i < num_phys_buf; ++i)
- for (j = 0;
- j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
- ++j)
- (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
- ((u64) j << *shift));
-
- PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
- __func__, (unsigned long long) *iova_start,
- (unsigned long long) mask, *shift, (unsigned long long) *total_size,
- *npages);
-
- return 0;
-
-}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index c34725c..2734820 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -458,9 +458,6 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
u32 mmid;
PDBG("%s ib_mr %p\n", __func__, ib_mr);
- /* There can be no memory windows */
- if (atomic_read(&ib_mr->usecnt))
- return -EINVAL;
mhp = to_iwch_mr(ib_mr);
kfree(mhp->pages);
@@ -479,24 +476,25 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
return 0;
}
-static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- int acc,
- u64 *iova_start)
+static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
{
- __be64 *page_list;
- int shift;
- u64 total_size;
- int npages;
- struct iwch_dev *rhp;
- struct iwch_pd *php;
+ const u64 total_size = 0xffffffff;
+ const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK;
+ struct iwch_pd *php = to_iwch_pd(pd);
+ struct iwch_dev *rhp = php->rhp;
struct iwch_mr *mhp;
- int ret;
+ __be64 *page_list;
+ int shift = 26, npages, ret, i;
PDBG("%s ib_pd %p\n", __func__, pd);
- php = to_iwch_pd(pd);
- rhp = php->rhp;
+
+ /*
+ * T3 only supports 32 bits of size.
+ */
+ if (sizeof(phys_addr_t) > 4) {
+ pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n");
+ return ERR_PTR(-ENOTSUPP);
+ }
mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
if (!mhp)
@@ -504,22 +502,23 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
mhp->rhp = rhp;
- /* First check that we have enough alignment */
- if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+ npages = (total_size + (1ULL << shift) - 1) >> shift;
+ if (!npages) {
ret = -EINVAL;
goto err;
}
- if (num_phys_buf > 1 &&
- ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
- ret = -EINVAL;
+ page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL);
+ if (!page_list) {
+ ret = -ENOMEM;
goto err;
}
- ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
- &total_size, &npages, &shift, &page_list);
- if (ret)
- goto err;
+ for (i = 0; i < npages; i++)
+ page_list[i] = cpu_to_be64((u64)i << shift);
+
+ PDBG("%s mask 0x%llx shift %d len %lld pbl_size %d\n",
+ __func__, mask, shift, total_size, npages);
ret = iwch_alloc_pbl(mhp, npages);
if (ret) {
@@ -536,7 +535,7 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
mhp->attr.zbva = 0;
mhp->attr.perms = iwch_ib_to_tpt_access(acc);
- mhp->attr.va_fbo = *iova_start;
+ mhp->attr.va_fbo = 0;
mhp->attr.page_size = shift - 12;
mhp->attr.len = (u32) total_size;
@@ -553,76 +552,8 @@ err_pbl:
err:
kfree(mhp);
return ERR_PTR(ret);
-
-}
-
-static int iwch_reregister_phys_mem(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- int acc, u64 * iova_start)
-{
-
- struct iwch_mr mh, *mhp;
- struct iwch_pd *php;
- struct iwch_dev *rhp;
- __be64 *page_list = NULL;
- int shift = 0;
- u64 total_size;
- int npages = 0;
- int ret;
-
- PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
-
- /* There can be no memory windows */
- if (atomic_read(&mr->usecnt))
- return -EINVAL;
-
- mhp = to_iwch_mr(mr);
- rhp = mhp->rhp;
- php = to_iwch_pd(mr->pd);
-
- /* make sure we are on the same adapter */
- if (rhp != php->rhp)
- return -EINVAL;
-
- memcpy(&mh, mhp, sizeof *mhp);
-
- if (mr_rereg_mask & IB_MR_REREG_PD)
- php = to_iwch_pd(pd);
- if (mr_rereg_mask & IB_MR_REREG_ACCESS)
- mh.attr.perms = iwch_ib_to_tpt_access(acc);
- if (mr_rereg_mask & IB_MR_REREG_TRANS) {
- ret = build_phys_page_list(buffer_list, num_phys_buf,
- iova_start,
- &total_size, &npages,
- &shift, &page_list);
- if (ret)
- return ret;
- }
-
- ret = iwch_reregister_mem(rhp, php, &mh, shift, npages);
- kfree(page_list);
- if (ret) {
- return ret;
- }
- if (mr_rereg_mask & IB_MR_REREG_PD)
- mhp->attr.pdid = php->pdid;
- if (mr_rereg_mask & IB_MR_REREG_ACCESS)
- mhp->attr.perms = iwch_ib_to_tpt_access(acc);
- if (mr_rereg_mask & IB_MR_REREG_TRANS) {
- mhp->attr.zbva = 0;
- mhp->attr.va_fbo = *iova_start;
- mhp->attr.page_size = shift - 12;
- mhp->attr.len = (u32) total_size;
- mhp->attr.pbl_size = npages;
- }
-
- return 0;
}
-
static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
@@ -726,28 +657,6 @@ err:
return ERR_PTR(err);
}
-static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
-{
- struct ib_phys_buf bl;
- u64 kva;
- struct ib_mr *ibmr;
-
- PDBG("%s ib_pd %p\n", __func__, pd);
-
- /*
- * T3 only supports 32 bits of size.
- */
- if (sizeof(phys_addr_t) > 4) {
- pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n");
- return ERR_PTR(-ENOTSUPP);
- }
- bl.size = 0xffffffff;
- bl.addr = 0;
- kva = 0;
- ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva);
- return ibmr;
-}
-
static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
{
struct iwch_dev *rhp;
@@ -1452,12 +1361,9 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.resize_cq = iwch_resize_cq;
dev->ibdev.poll_cq = iwch_poll_cq;
dev->ibdev.get_dma_mr = iwch_get_dma_mr;
- dev->ibdev.reg_phys_mr = iwch_register_phys_mem;
- dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem;
dev->ibdev.reg_user_mr = iwch_reg_user_mr;
dev->ibdev.dereg_mr = iwch_dereg_mr;
dev->ibdev.alloc_mw = iwch_alloc_mw;
- dev->ibdev.bind_mw = iwch_bind_mw;
dev->ibdev.dealloc_mw = iwch_dealloc_mw;
dev->ibdev.alloc_mr = iwch_alloc_mr;
dev->ibdev.map_mr_sg = iwch_map_mr_sg;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
index 2ac85b8..252c464 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -330,9 +330,6 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr);
int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
-int iwch_bind_mw(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
int iwch_post_zb_read(struct iwch_ep *ep);
@@ -341,21 +338,9 @@ void iwch_unregister_device(struct iwch_dev *dev);
void stop_read_rep_timer(struct iwch_qp *qhp);
int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
struct iwch_mr *mhp, int shift);
-int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
- struct iwch_mr *mhp,
- int shift,
- int npages);
int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
void iwch_free_pbl(struct iwch_mr *mhp);
int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
-int build_phys_page_list(struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- u64 *iova_start,
- u64 *total_size,
- int *npages,
- int *shift,
- __be64 **page_list);
-
#define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index d0548fc..d939980 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -526,88 +526,6 @@ out:
return err;
}
-int iwch_bind_mw(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind)
-{
- struct iwch_dev *rhp;
- struct iwch_mw *mhp;
- struct iwch_qp *qhp;
- union t3_wr *wqe;
- u32 pbl_addr;
- u8 page_size;
- u32 num_wrs;
- unsigned long flag;
- struct ib_sge sgl;
- int err=0;
- enum t3_wr_flags t3_wr_flags;
- u32 idx;
- struct t3_swsq *sqp;
-
- qhp = to_iwch_qp(qp);
- mhp = to_iwch_mw(mw);
- rhp = qhp->rhp;
-
- spin_lock_irqsave(&qhp->lock, flag);
- if (qhp->attr.state > IWCH_QP_STATE_RTS) {
- spin_unlock_irqrestore(&qhp->lock, flag);
- return -EINVAL;
- }
- num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
- qhp->wq.sq_size_log2);
- if (num_wrs == 0) {
- spin_unlock_irqrestore(&qhp->lock, flag);
- return -ENOMEM;
- }
- idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
- PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx,
- mw, mw_bind);
- wqe = (union t3_wr *) (qhp->wq.queue + idx);
-
- t3_wr_flags = 0;
- if (mw_bind->send_flags & IB_SEND_SIGNALED)
- t3_wr_flags = T3_COMPLETION_FLAG;
-
- sgl.addr = mw_bind->bind_info.addr;
- sgl.lkey = mw_bind->bind_info.mr->lkey;
- sgl.length = mw_bind->bind_info.length;
- wqe->bind.reserved = 0;
- wqe->bind.type = TPT_VATO;
-
- /* TBD: check perms */
- wqe->bind.perms = iwch_ib_to_tpt_bind_access(
- mw_bind->bind_info.mw_access_flags);
- wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey);
- wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
- wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length);
- wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr);
- err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
- if (err) {
- spin_unlock_irqrestore(&qhp->lock, flag);
- return err;
- }
- wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
- sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
- sqp->wr_id = mw_bind->wr_id;
- sqp->opcode = T3_BIND_MW;
- sqp->sq_wptr = qhp->wq.sq_wptr;
- sqp->complete = 0;
- sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
- wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
- wqe->bind.mr_pagesz = page_size;
- build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
- Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
- sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
- ++(qhp->wq.wptr);
- ++(qhp->wq.sq_wptr);
- spin_unlock_irqrestore(&qhp->lock, flag);
-
- if (cxio_wq_db_enabled(&qhp->wq))
- ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
-
- return err;
-}
-
static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
u8 *layer_type, u8 *ecode)
{
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 326d07d..cd2ff5f 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3271,6 +3271,12 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
&ep->com.mapped_local_addr;
+ if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) {
+ err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+ (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+ if (err)
+ return err;
+ }
c4iw_init_wr_wait(&ep->com.wr_wait);
err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0],
ep->stid, &sin6->sin6_addr,
@@ -3282,13 +3288,13 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
0, 0, __func__);
else if (err > 0)
err = net_xmit_errno(err);
- if (err)
+ if (err) {
+ cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+ (const u32 *)&sin6->sin6_addr.s6_addr, 1);
pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
err, ep->stid,
sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
- else
- cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
- (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+ }
return err;
}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index de9cd69..cf21df4 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -744,9 +744,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
case FW_RI_SEND_WITH_SE:
wc->opcode = IB_WC_SEND;
break;
- case FW_RI_BIND_MW:
- wc->opcode = IB_WC_BIND_MW;
- break;
case FW_RI_LOCAL_INV:
wc->opcode = IB_WC_LOCAL_INV;
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 58fce174..8024ea4 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -315,14 +315,12 @@ static int qp_release(struct inode *inode, struct file *file)
static int qp_open(struct inode *inode, struct file *file)
{
struct c4iw_debugfs_data *qpd;
- int ret = 0;
int count = 1;
qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
- if (!qpd) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!qpd)
+ return -ENOMEM;
+
qpd->devp = inode->i_private;
qpd->pos = 0;
@@ -333,8 +331,8 @@ static int qp_open(struct inode *inode, struct file *file)
qpd->bufsize = count * 128;
qpd->buf = vmalloc(qpd->bufsize);
if (!qpd->buf) {
- ret = -ENOMEM;
- goto err1;
+ kfree(qpd);
+ return -ENOMEM;
}
spin_lock_irq(&qpd->devp->lock);
@@ -343,11 +341,7 @@ static int qp_open(struct inode *inode, struct file *file)
qpd->buf[qpd->pos++] = 0;
file->private_data = qpd;
- goto out;
-err1:
- kfree(qpd);
-out:
- return ret;
+ return 0;
}
static const struct file_operations qp_debugfs_fops = {
@@ -781,8 +775,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n",
pci_name(rdev->lldi.pdev), rdev->lldi.udb_density,
rdev->lldi.ucq_density);
- err = -EINVAL;
- goto err1;
+ return -EINVAL;
}
if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start ||
rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) {
@@ -791,8 +784,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start,
rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size,
rdev->lldi.vr->cq.size);
- err = -EINVAL;
- goto err1;
+ return -EINVAL;
}
rdev->qpmask = rdev->lldi.udb_density - 1;
@@ -816,10 +808,8 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
rdev->lldi.db_reg, rdev->lldi.gts_reg,
rdev->qpmask, rdev->cqmask);
- if (c4iw_num_stags(rdev) == 0) {
- err = -EINVAL;
- goto err1;
- }
+ if (c4iw_num_stags(rdev) == 0)
+ return -EINVAL;
rdev->stats.pd.total = T4_MAX_NUM_PD;
rdev->stats.stag.total = rdev->lldi.vr->stag.size;
@@ -831,29 +821,31 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
if (err) {
printk(KERN_ERR MOD "error %d initializing resources\n", err);
- goto err1;
+ return err;
}
err = c4iw_pblpool_create(rdev);
if (err) {
printk(KERN_ERR MOD "error %d initializing pbl pool\n", err);
- goto err2;
+ goto destroy_resource;
}
err = c4iw_rqtpool_create(rdev);
if (err) {
printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
- goto err3;
+ goto destroy_pblpool;
}
err = c4iw_ocqp_pool_create(rdev);
if (err) {
printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
- goto err4;
+ goto destroy_rqtpool;
}
rdev->status_page = (struct t4_dev_status_page *)
__get_free_page(GFP_KERNEL);
- if (!rdev->status_page) {
- pr_err(MOD "error allocating status page\n");
- goto err4;
- }
+ if (!rdev->status_page)
+ goto destroy_ocqp_pool;
+ rdev->status_page->qp_start = rdev->lldi.vr->qp.start;
+ rdev->status_page->qp_size = rdev->lldi.vr->qp.size;
+ rdev->status_page->cq_start = rdev->lldi.vr->cq.start;
+ rdev->status_page->cq_size = rdev->lldi.vr->cq.size;
if (c4iw_wr_log) {
rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) *
@@ -869,13 +861,14 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
rdev->status_page->db_off = 0;
return 0;
-err4:
+destroy_ocqp_pool:
+ c4iw_ocqp_pool_destroy(rdev);
+destroy_rqtpool:
c4iw_rqtpool_destroy(rdev);
-err3:
+destroy_pblpool:
c4iw_pblpool_destroy(rdev);
-err2:
+destroy_resource:
c4iw_destroy_resource(&rdev->resource);
-err1:
return err;
}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 00e55fa..fb2de75 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -947,8 +947,6 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr);
int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
-int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog);
int c4iw_destroy_listen(struct iw_cm_id *cm_id);
@@ -968,17 +966,6 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
u64 length, u64 virt, int acc,
struct ib_udata *udata);
struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
-struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- int acc,
- u64 *iova_start);
-int c4iw_reregister_phys_mem(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- int acc, u64 *iova_start);
int c4iw_dereg_mr(struct ib_mr *ib_mr);
int c4iw_destroy_cq(struct ib_cq *ib_cq);
struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index e1629ab..7849890 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -392,32 +392,6 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
return ret;
}
-static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
- struct c4iw_mr *mhp, int shift, int npages)
-{
- u32 stag;
- int ret;
-
- if (npages > mhp->attr.pbl_size)
- return -ENOMEM;
-
- stag = mhp->attr.stag;
- ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
- FW_RI_STAG_NSMR, mhp->attr.perms,
- mhp->attr.mw_bind_enable, mhp->attr.zbva,
- mhp->attr.va_fbo, mhp->attr.len, shift - 12,
- mhp->attr.pbl_size, mhp->attr.pbl_addr);
- if (ret)
- return ret;
-
- ret = finish_mem_reg(mhp, stag);
- if (ret)
- dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
-
- return ret;
-}
-
static int alloc_pbl(struct c4iw_mr *mhp, int npages)
{
mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev,
@@ -431,228 +405,6 @@ static int alloc_pbl(struct c4iw_mr *mhp, int npages)
return 0;
}
-static int build_phys_page_list(struct ib_phys_buf *buffer_list,
- int num_phys_buf, u64 *iova_start,
- u64 *total_size, int *npages,
- int *shift, __be64 **page_list)
-{
- u64 mask;
- int i, j, n;
-
- mask = 0;
- *total_size = 0;
- for (i = 0; i < num_phys_buf; ++i) {
- if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
- return -EINVAL;
- if (i != 0 && i != num_phys_buf - 1 &&
- (buffer_list[i].size & ~PAGE_MASK))
- return -EINVAL;
- *total_size += buffer_list[i].size;
- if (i > 0)
- mask |= buffer_list[i].addr;
- else
- mask |= buffer_list[i].addr & PAGE_MASK;
- if (i != num_phys_buf - 1)
- mask |= buffer_list[i].addr + buffer_list[i].size;
- else
- mask |= (buffer_list[i].addr + buffer_list[i].size +
- PAGE_SIZE - 1) & PAGE_MASK;
- }
-
- if (*total_size > 0xFFFFFFFFULL)
- return -ENOMEM;
-
- /* Find largest page shift we can use to cover buffers */
- for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
- if ((1ULL << *shift) & mask)
- break;
-
- buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
- buffer_list[0].addr &= ~0ull << *shift;
-
- *npages = 0;
- for (i = 0; i < num_phys_buf; ++i)
- *npages += (buffer_list[i].size +
- (1ULL << *shift) - 1) >> *shift;
-
- if (!*npages)
- return -EINVAL;
-
- *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
- if (!*page_list)
- return -ENOMEM;
-
- n = 0;
- for (i = 0; i < num_phys_buf; ++i)
- for (j = 0;
- j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
- ++j)
- (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
- ((u64) j << *shift));
-
- PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
- __func__, (unsigned long long)*iova_start,
- (unsigned long long)mask, *shift, (unsigned long long)*total_size,
- *npages);
-
- return 0;
-
-}
-
-int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
- struct ib_pd *pd, struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start)
-{
-
- struct c4iw_mr mh, *mhp;
- struct c4iw_pd *php;
- struct c4iw_dev *rhp;
- __be64 *page_list = NULL;
- int shift = 0;
- u64 total_size;
- int npages;
- int ret;
-
- PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
-
- /* There can be no memory windows */
- if (atomic_read(&mr->usecnt))
- return -EINVAL;
-
- mhp = to_c4iw_mr(mr);
- rhp = mhp->rhp;
- php = to_c4iw_pd(mr->pd);
-
- /* make sure we are on the same adapter */
- if (rhp != php->rhp)
- return -EINVAL;
-
- memcpy(&mh, mhp, sizeof *mhp);
-
- if (mr_rereg_mask & IB_MR_REREG_PD)
- php = to_c4iw_pd(pd);
- if (mr_rereg_mask & IB_MR_REREG_ACCESS) {
- mh.attr.perms = c4iw_ib_to_tpt_access(acc);
- mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) ==
- IB_ACCESS_MW_BIND;
- }
- if (mr_rereg_mask & IB_MR_REREG_TRANS) {
- ret = build_phys_page_list(buffer_list, num_phys_buf,
- iova_start,
- &total_size, &npages,
- &shift, &page_list);
- if (ret)
- return ret;
- }
-
- if (mr_exceeds_hw_limits(rhp, total_size)) {
- kfree(page_list);
- return -EINVAL;
- }
-
- ret = reregister_mem(rhp, php, &mh, shift, npages);
- kfree(page_list);
- if (ret)
- return ret;
- if (mr_rereg_mask & IB_MR_REREG_PD)
- mhp->attr.pdid = php->pdid;
- if (mr_rereg_mask & IB_MR_REREG_ACCESS)
- mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
- if (mr_rereg_mask & IB_MR_REREG_TRANS) {
- mhp->attr.zbva = 0;
- mhp->attr.va_fbo = *iova_start;
- mhp->attr.page_size = shift - 12;
- mhp->attr.len = (u32) total_size;
- mhp->attr.pbl_size = npages;
- }
-
- return 0;
-}
-
-struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start)
-{
- __be64 *page_list;
- int shift;
- u64 total_size;
- int npages;
- struct c4iw_dev *rhp;
- struct c4iw_pd *php;
- struct c4iw_mr *mhp;
- int ret;
-
- PDBG("%s ib_pd %p\n", __func__, pd);
- php = to_c4iw_pd(pd);
- rhp = php->rhp;
-
- mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
- if (!mhp)
- return ERR_PTR(-ENOMEM);
-
- mhp->rhp = rhp;
-
- /* First check that we have enough alignment */
- if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
- ret = -EINVAL;
- goto err;
- }
-
- if (num_phys_buf > 1 &&
- ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
- &total_size, &npages, &shift,
- &page_list);
- if (ret)
- goto err;
-
- if (mr_exceeds_hw_limits(rhp, total_size)) {
- kfree(page_list);
- ret = -EINVAL;
- goto err;
- }
-
- ret = alloc_pbl(mhp, npages);
- if (ret) {
- kfree(page_list);
- goto err;
- }
-
- ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr,
- npages);
- kfree(page_list);
- if (ret)
- goto err_pbl;
-
- mhp->attr.pdid = php->pdid;
- mhp->attr.zbva = 0;
-
- mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
- mhp->attr.va_fbo = *iova_start;
- mhp->attr.page_size = shift - 12;
-
- mhp->attr.len = (u32) total_size;
- mhp->attr.pbl_size = npages;
- ret = register_mem(rhp, php, mhp, shift);
- if (ret)
- goto err_pbl;
-
- return &mhp->ibmr;
-
-err_pbl:
- c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
- mhp->attr.pbl_size << 3);
-
-err:
- kfree(mhp);
- return ERR_PTR(ret);
-
-}
-
struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
{
struct c4iw_dev *rhp;
@@ -952,9 +704,6 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
u32 mmid;
PDBG("%s ib_mr %p\n", __func__, ib_mr);
- /* There can be no memory windows */
- if (atomic_read(&ib_mr->usecnt))
- return -EINVAL;
mhp = to_c4iw_mr(ib_mr);
rhp = mhp->rhp;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 0a7d998..ec04272 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -549,12 +549,9 @@ int c4iw_register_device(struct c4iw_dev *dev)
dev->ibdev.resize_cq = c4iw_resize_cq;
dev->ibdev.poll_cq = c4iw_poll_cq;
dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
- dev->ibdev.reg_phys_mr = c4iw_register_phys_mem;
- dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem;
dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
dev->ibdev.dereg_mr = c4iw_dereg_mr;
dev->ibdev.alloc_mw = c4iw_alloc_mw;
- dev->ibdev.bind_mw = c4iw_bind_mw;
dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
dev->ibdev.alloc_mr = c4iw_alloc_mr;
dev->ibdev.map_mr_sg = c4iw_map_mr_sg;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index aa515af..e99345e 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -933,11 +933,6 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
return err;
}
-int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind)
-{
- return -ENOSYS;
-}
-
static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
u8 *ecode)
{
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 1092a2d..6126bbe 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -699,4 +699,11 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq)
struct t4_dev_status_page {
u8 db_off;
+ u8 pad1;
+ u16 pad2;
+ u32 pad3;
+ u64 qp_start;
+ u64 qp_size;
+ u64 cq_start;
+ u64 cq_size;
};
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
index cbd0ce1..295f422 100644
--- a/drivers/infiniband/hw/cxgb4/user.h
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -32,7 +32,7 @@
#ifndef __C4IW_USER_H__
#define __C4IW_USER_H__
-#define C4IW_UVERBS_ABI_VERSION 2
+#define C4IW_UVERBS_ABI_VERSION 3
/*
* Make sure that all structs defined in this file remain laid out so
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index 86af713..105246f 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -92,7 +92,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
ah_attr->grh.sgid_index, &sgid, &gid_attr);
if (ret)
return ERR_PTR(ret);
- memset(ah->av.eth.s_mac, 0, ETH_ALEN);
+ eth_zero_addr(ah->av.eth.s_mac);
if (gid_attr.ndev) {
if (is_vlan_dev(gid_attr.ndev))
vlan_tag = vlan_dev_vlan_id(gid_attr.ndev);
@@ -104,6 +104,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index);
ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+ ah->av.eth.hop_limit = ah_attr->grh.hop_limit;
if (ah_attr->static_rate) {
ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index b88fc8f..9f8b516 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -811,9 +811,6 @@ repoll:
wc->opcode = IB_WC_MASKED_FETCH_ADD;
wc->byte_len = 8;
break;
- case MLX4_OPCODE_BIND_MW:
- wc->opcode = IB_WC_BIND_MW;
- break;
case MLX4_OPCODE_LSO:
wc->opcode = IB_WC_LSO;
break;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 97d6878..1c7ab6c 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -154,9 +154,9 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_n
return dev;
}
-static int mlx4_ib_update_gids(struct gid_entry *gids,
- struct mlx4_ib_dev *ibdev,
- u8 port_num)
+static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u8 port_num)
{
struct mlx4_cmd_mailbox *mailbox;
int err;
@@ -187,6 +187,63 @@ static int mlx4_ib_update_gids(struct gid_entry *gids,
return err;
}
+static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u8 port_num)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ struct {
+ union ib_gid gid;
+ __be32 rsrvd1[2];
+ __be16 rsrvd2;
+ u8 type;
+ u8 version;
+ __be32 rsrvd3;
+ } *gid_tbl;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+
+ gid_tbl = mailbox->buf;
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+ memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid));
+ if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+ gid_tbl[i].version = 2;
+ if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid))
+ gid_tbl[i].type = 1;
+ else
+ memset(&gid_tbl[i].gid, 0, 12);
+ }
+ }
+
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_ROCE_ADDR << 8 | port_num,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (mlx4_is_bonded(dev))
+ err += mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_ROCE_ADDR << 8 | 2,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+ struct mlx4_ib_dev *ibdev,
+ u8 port_num)
+{
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+ return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num);
+
+ return mlx4_ib_update_gids_v1(gids, ibdev, port_num);
+}
+
static int mlx4_ib_add_gid(struct ib_device *device,
u8 port_num,
unsigned int index,
@@ -215,7 +272,8 @@ static int mlx4_ib_add_gid(struct ib_device *device,
port_gid_table = &iboe->gids[port_num - 1];
spin_lock_bh(&iboe->lock);
for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
- if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+ if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) &&
+ (port_gid_table->gids[i].gid_type == attr->gid_type)) {
found = i;
break;
}
@@ -233,6 +291,7 @@ static int mlx4_ib_add_gid(struct ib_device *device,
} else {
*context = port_gid_table->gids[free].ctx;
memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+ port_gid_table->gids[free].gid_type = attr->gid_type;
port_gid_table->gids[free].ctx->real_index = free;
port_gid_table->gids[free].ctx->refcount = 1;
hw_update = 1;
@@ -248,8 +307,10 @@ static int mlx4_ib_add_gid(struct ib_device *device,
if (!gids) {
ret = -ENOMEM;
} else {
- for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+ gids[i].gid_type = port_gid_table->gids[i].gid_type;
+ }
}
}
spin_unlock_bh(&iboe->lock);
@@ -325,6 +386,7 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
int i;
int ret;
unsigned long flags;
+ struct ib_gid_attr attr;
if (port_num > MLX4_MAX_PORTS)
return -EINVAL;
@@ -335,10 +397,13 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
return index;
- ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, NULL);
+ ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr);
if (ret)
return ret;
+ if (attr.ndev)
+ dev_put(attr.ndev);
+
if (!memcmp(&gid, &zgid, sizeof(gid)))
return -EINVAL;
@@ -346,7 +411,8 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
port_gid_table = &iboe->gids[port_num - 1];
for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
- if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
+ if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) &&
+ attr.gid_type == port_gid_table->gids[i].gid_type) {
ctx = port_gid_table->gids[i].ctx;
break;
}
@@ -2119,6 +2185,7 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
struct ib_port_immutable *immutable)
{
struct ib_port_attr attr;
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
int err;
err = mlx4_ib_query_port(ibdev, port_num, &attr);
@@ -2128,10 +2195,15 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
- if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND)
+ if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) {
immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
- else
- immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+ } else {
+ if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+ if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
+ RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+ }
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
@@ -2283,7 +2355,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
- ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
ibdev->ib_dev.uverbs_cmd_mask |=
@@ -2423,7 +2494,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (mlx4_ib_init_sriov(ibdev))
goto err_mad;
- if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE ||
+ dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
if (!iboe->nb.notifier_call) {
iboe->nb.notifier_call = mlx4_ib_netdev_event;
err = register_netdevice_notifier(&iboe->nb);
@@ -2432,6 +2504,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
goto err_notif;
}
}
+ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+ err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
+ if (err) {
+ goto err_notif;
+ }
+ }
}
for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 1caa11e..52ce7b0 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -177,11 +177,18 @@ struct mlx4_ib_wq {
unsigned tail;
};
+enum {
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START
+};
+
enum mlx4_ib_qp_flags {
MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+
+ /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */
+ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI,
MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
MLX4_IB_SRIOV_SQP = 1 << 31,
};
@@ -478,6 +485,7 @@ struct gid_cache_context {
struct gid_entry {
union ib_gid gid;
+ enum ib_gid_type gid_type;
struct gid_cache_context *ctx;
};
@@ -704,8 +712,6 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct ib_udata *udata);
int mlx4_ib_dereg_mr(struct ib_mr *mr);
struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
int mlx4_ib_dealloc_mw(struct ib_mw *mw);
struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 4d1e1c6..242b94e 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -366,28 +366,6 @@ err_free:
return ERR_PTR(err);
}
-int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
- struct ib_mw_bind *mw_bind)
-{
- struct ib_bind_mw_wr wr;
- struct ib_send_wr *bad_wr;
- int ret;
-
- memset(&wr, 0, sizeof(wr));
- wr.wr.opcode = IB_WR_BIND_MW;
- wr.wr.wr_id = mw_bind->wr_id;
- wr.wr.send_flags = mw_bind->send_flags;
- wr.mw = mw;
- wr.bind_info = mw_bind->bind_info;
- wr.rkey = ib_inc_rkey(mw->rkey);
-
- ret = mlx4_ib_post_send(qp, &wr.wr, &bad_wr);
- if (!ret)
- mw->rkey = wr.rkey;
-
- return ret;
-}
-
int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
{
struct mlx4_ib_mw *mw = to_mmw(ibmw);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 13eaaf4..bc5536f 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -32,6 +32,8 @@
*/
#include <linux/log2.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
@@ -85,6 +87,7 @@ struct mlx4_ib_sqp {
u32 send_psn;
struct ib_ud_header ud_header;
u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+ struct ib_qp *roce_v2_gsi;
};
enum {
@@ -115,7 +118,6 @@ static const __be32 mlx4_ib_opcode[] = {
[IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
[IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
- [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW),
};
static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
@@ -154,7 +156,10 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
}
}
}
- return proxy_sqp;
+ if (proxy_sqp)
+ return 1;
+
+ return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
}
/* used for INIT/CLOSE port logic */
@@ -796,11 +801,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
if (err)
goto err_mtt;
- qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+ qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64),
+ gfp | __GFP_NOWARN);
if (!qp->sq.wrid)
qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
gfp, PAGE_KERNEL);
- qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+ qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64),
+ gfp | __GFP_NOWARN);
if (!qp->rq.wrid)
qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
gfp, PAGE_KERNEL);
@@ -1099,9 +1106,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
return dev->dev->caps.qp1_proxy[attr->port_num - 1];
}
-struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
- struct ib_qp_init_attr *init_attr,
- struct ib_udata *udata)
+static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
{
struct mlx4_ib_qp *qp = NULL;
int err;
@@ -1120,6 +1127,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
MLX4_IB_SRIOV_TUNNEL_QP |
MLX4_IB_SRIOV_SQP |
MLX4_IB_QP_NETIF |
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI |
MLX4_IB_QP_CREATE_USE_GFP_NOIO))
return ERR_PTR(-EINVAL);
@@ -1128,15 +1136,21 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
return ERR_PTR(-EINVAL);
}
- if (init_attr->create_flags &&
- ((udata && init_attr->create_flags & ~(sup_u_create_flags)) ||
- ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
- MLX4_IB_QP_CREATE_USE_GFP_NOIO |
- MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)) &&
- init_attr->qp_type != IB_QPT_UD) ||
- ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
- init_attr->qp_type > IB_QPT_GSI)))
- return ERR_PTR(-EINVAL);
+ if (init_attr->create_flags) {
+ if (udata && init_attr->create_flags & ~(sup_u_create_flags))
+ return ERR_PTR(-EINVAL);
+
+ if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
+ MLX4_IB_QP_CREATE_USE_GFP_NOIO |
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI |
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) &&
+ init_attr->qp_type != IB_QPT_UD) ||
+ (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
+ init_attr->qp_type > IB_QPT_GSI) ||
+ (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
+ init_attr->qp_type != IB_QPT_GSI))
+ return ERR_PTR(-EINVAL);
+ }
switch (init_attr->qp_type) {
case IB_QPT_XRC_TGT:
@@ -1173,19 +1187,29 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
case IB_QPT_SMI:
case IB_QPT_GSI:
{
+ int sqpn;
+
/* Userspace is not allowed to create special QPs: */
if (udata)
return ERR_PTR(-EINVAL);
+ if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
+ int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0);
+
+ if (res)
+ return ERR_PTR(res);
+ } else {
+ sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
+ }
err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
- get_sqp_num(to_mdev(pd->device), init_attr),
+ sqpn,
&qp, gfp);
if (err)
return ERR_PTR(err);
qp->port = init_attr->port_num;
- qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
-
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
+ init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
break;
}
default:
@@ -1196,7 +1220,41 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
return &qp->ibqp;
}
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata) {
+ struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
+ struct ib_qp *ibqp;
+ struct mlx4_ib_dev *dev = to_mdev(device);
+
+ ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
+
+ if (!IS_ERR(ibqp) &&
+ (init_attr->qp_type == IB_QPT_GSI) &&
+ !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
+ struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
+ int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
+
+ if (is_eth &&
+ dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+ init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
+
+ if (IS_ERR(sqp->roce_v2_gsi)) {
+ pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
+ sqp->roce_v2_gsi = NULL;
+ } else {
+ sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
+ sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
+ }
+
+ init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ }
+ }
+ return ibqp;
+}
+
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
{
struct mlx4_ib_dev *dev = to_mdev(qp->device);
struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1225,6 +1283,20 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
return 0;
}
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+
+ if (sqp->roce_v2_gsi)
+ ib_destroy_qp(sqp->roce_v2_gsi);
+ }
+
+ return _mlx4_ib_destroy_qp(qp);
+}
+
static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
{
switch (type) {
@@ -1507,6 +1579,24 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
return 0;
}
+enum {
+ MLX4_QPC_ROCE_MODE_1 = 0,
+ MLX4_QPC_ROCE_MODE_2 = 2,
+ MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
+};
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+ switch (gid_type) {
+ case IB_GID_TYPE_ROCE:
+ return MLX4_QPC_ROCE_MODE_1;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ return MLX4_QPC_ROCE_MODE_2;
+ default:
+ return MLX4_QPC_ROCE_MODE_UNDEFINED;
+ }
+}
+
static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -1633,6 +1723,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
mlx4_ib_steer_qp_reg(dev, qp, 1);
steer_qp = 1;
}
+
+ if (ibqp->qp_type == IB_QPT_GSI) {
+ enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
+ IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
+
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
}
if (attr_mask & IB_QP_PKEY_INDEX) {
@@ -1650,9 +1748,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
u16 vlan = 0xffff;
u8 smac[ETH_ALEN];
int status = 0;
+ int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
+ attr->ah_attr.ah_flags & IB_AH_GRH;
- if (rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
- attr->ah_attr.ah_flags & IB_AH_GRH) {
+ if (is_eth) {
int index = attr->ah_attr.grh.sgid_index;
status = ib_get_cached_gid(ibqp->device, port_num,
@@ -1674,6 +1773,18 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
MLX4_QP_OPTPAR_SCHED_QUEUE);
+
+ if (is_eth &&
+ (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type);
+
+ if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) {
+ err = -EINVAL;
+ goto out;
+ }
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
+
}
if (attr_mask & IB_QP_TIMEOUT) {
@@ -1845,7 +1956,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
sqd_event = 0;
if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
- context->rlkey |= (1 << 4);
+ context->rlkey_roce_mode |= (1 << 4);
/*
* Before passing a kernel QP to the HW, make sure that the
@@ -2022,8 +2133,8 @@ out:
return err;
}
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
- int attr_mask, struct ib_udata *udata)
+static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
{
struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
struct mlx4_ib_qp *qp = to_mqp(ibqp);
@@ -2126,6 +2237,27 @@ out:
return err;
}
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ int ret;
+
+ ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+ int err = 0;
+
+ if (sqp->roce_v2_gsi)
+ err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
+ if (err)
+ pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n",
+ err);
+ }
+ return ret;
+}
+
static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
{
int i;
@@ -2168,7 +2300,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
send_size += sizeof (struct mlx4_ib_tunnel_header);
- ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+ ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
sqp->ud_header.lrh.service_level =
@@ -2252,16 +2384,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
return 0;
}
-static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
-{
- int i;
-
- for (i = ETH_ALEN; i; i--) {
- dst_mac[i - 1] = src_mac & 0xff;
- src_mac >>= 8;
- }
-}
-
+#define MLX4_ROCEV2_QP1_SPORT 0xC000
static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
void *wqe, unsigned *mlx_seg_len)
{
@@ -2281,6 +2404,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
bool is_eth;
bool is_vlan = false;
bool is_grh;
+ bool is_udp = false;
+ int ip_version = 0;
send_size = 0;
for (i = 0; i < wr->wr.num_sge; ++i)
@@ -2289,6 +2414,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
is_grh = mlx4_ib_ah_grh_present(ah);
if (is_eth) {
+ struct ib_gid_attr gid_attr;
+
if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
/* When multi-function is enabled, the ib_core gid
* indexes don't necessarily match the hw ones, so
@@ -2302,19 +2429,35 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
err = ib_get_cached_gid(ib_dev,
be32_to_cpu(ah->av.ib.port_pd) >> 24,
ah->av.ib.gid_index, &sgid,
- NULL);
- if (!err && !memcmp(&sgid, &zgid, sizeof(sgid)))
- err = -ENOENT;
- if (err)
+ &gid_attr);
+ if (!err) {
+ if (gid_attr.ndev)
+ dev_put(gid_attr.ndev);
+ if (!memcmp(&sgid, &zgid, sizeof(sgid)))
+ err = -ENOENT;
+ }
+ if (!err) {
+ is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+ if (is_udp) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
+ ip_version = 4;
+ else
+ ip_version = 6;
+ is_grh = false;
+ }
+ } else {
return err;
+ }
}
-
if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
is_vlan = 1;
}
}
- ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+ err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
+ ip_version, is_udp, 0, &sqp->ud_header);
+ if (err)
+ return err;
if (!is_eth) {
sqp->ud_header.lrh.service_level =
@@ -2323,7 +2466,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
}
- if (is_grh) {
+ if (is_grh || (ip_version == 6)) {
sqp->ud_header.grh.traffic_class =
(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
sqp->ud_header.grh.flow_label =
@@ -2352,6 +2495,25 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
ah->av.ib.dgid, 16);
}
+ if (ip_version == 4) {
+ sqp->ud_header.ip4.tos =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.ip4.id = 0;
+ sqp->ud_header.ip4.frag_off = htons(IP_DF);
+ sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit;
+
+ memcpy(&sqp->ud_header.ip4.saddr,
+ sgid.raw + 12, 4);
+ memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
+ sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
+ }
+
+ if (is_udp) {
+ sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
+ sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
+ sqp->ud_header.udp.csum = 0;
+ }
+
mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
if (!is_eth) {
@@ -2380,34 +2542,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
if (is_eth) {
struct in6_addr in6;
-
+ u16 ether_type;
u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+ ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE :
+ (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
+
mlx->sched_prio = cpu_to_be16(pcp);
+ ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
- /* FIXME: cache smac value? */
memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
memcpy(&in6, sgid.raw, sizeof(in6));
- if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
- u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
- u8 smac[ETH_ALEN];
-
- mlx4_u64_to_smac(smac, mac);
- memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
- } else {
- /* use the src mac of the tunnel */
- memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
- }
if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
if (!is_vlan) {
- sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ sqp->ud_header.eth.type = cpu_to_be16(ether_type);
} else {
- sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
}
} else {
@@ -2528,25 +2683,6 @@ static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg,
fseg->reserved[1] = 0;
}
-static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg,
- struct ib_bind_mw_wr *wr)
-{
- bseg->flags1 =
- convert_access(wr->bind_info.mw_access_flags) &
- cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ |
- MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE |
- MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC);
- bseg->flags2 = 0;
- if (wr->mw->type == IB_MW_TYPE_2)
- bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2);
- if (wr->bind_info.mw_access_flags & IB_ZERO_BASED)
- bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED);
- bseg->new_rkey = cpu_to_be32(wr->rkey);
- bseg->lkey = cpu_to_be32(wr->bind_info.mr->lkey);
- bseg->addr = cpu_to_be64(wr->bind_info.addr);
- bseg->length = cpu_to_be64(wr->bind_info.length);
-}
-
static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
{
memset(iseg, 0, sizeof(*iseg));
@@ -2766,6 +2902,29 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
int i;
struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(qp);
+
+ if (sqp->roce_v2_gsi) {
+ struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
+ struct ib_gid_attr gid_attr;
+ union ib_gid gid;
+
+ if (!ib_get_cached_gid(ibqp->device,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &gid,
+ &gid_attr)) {
+ if (gid_attr.ndev)
+ dev_put(gid_attr.ndev);
+ qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
+ to_mqp(sqp->roce_v2_gsi) : qp;
+ } else {
+ pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
+ ah->av.ib.gid_index);
+ }
+ }
+ }
+
spin_lock_irqsave(&qp->sq.lock, flags);
if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
err = -EIO;
@@ -2867,13 +3026,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
size += sizeof(struct mlx4_wqe_fmr_seg) / 16;
break;
- case IB_WR_BIND_MW:
- ctrl->srcrb_flags |=
- cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
- set_bind_seg(wqe, bind_mw_wr(wr));
- wqe += sizeof(struct mlx4_wqe_bind_seg);
- size += sizeof(struct mlx4_wqe_bind_seg) / 16;
- break;
default:
/* No extra segments required for sends */
break;
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index c394376..0597f3e 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -171,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
if (err)
goto err_mtt;
- srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+ srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64),
+ GFP_KERNEL | __GFP_NOWARN);
if (!srq->wrid) {
srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
GFP_KERNEL, PAGE_KERNEL);
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
index 6608058..745efa4 100644
--- a/drivers/infiniband/hw/mlx5/ah.c
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -32,8 +32,10 @@
#include "mlx5_ib.h"
-struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
- struct mlx5_ib_ah *ah)
+static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_ah *ah,
+ struct ib_ah_attr *ah_attr,
+ enum rdma_link_layer ll)
{
if (ah_attr->ah_flags & IB_AH_GRH) {
memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16);
@@ -44,9 +46,20 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
ah->av.tclass = ah_attr->grh.traffic_class;
}
- ah->av.rlid = cpu_to_be16(ah_attr->dlid);
- ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
- ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf);
+ ah->av.stat_rate_sl = (ah_attr->static_rate << 4);
+
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac));
+ ah->av.udp_sport =
+ mlx5_get_roce_udp_sport(dev,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index);
+ ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1;
+ } else {
+ ah->av.rlid = cpu_to_be16(ah_attr->dlid);
+ ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
+ ah->av.stat_rate_sl |= (ah_attr->sl & 0xf);
+ }
return &ah->ibah;
}
@@ -54,12 +67,19 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
{
struct mlx5_ib_ah *ah;
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ enum rdma_link_layer ll;
+
+ ll = pd->device->get_link_layer(pd->device, ah_attr->port_num);
+
+ if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH))
+ return ERR_PTR(-EINVAL);
ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
if (!ah)
return ERR_PTR(-ENOMEM);
- return create_ib_ah(ah_attr, ah); /* never fails */
+ return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */
}
int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 92ddae1..fd1de31 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -154,9 +154,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
wc->opcode = IB_WC_MASKED_FETCH_ADD;
wc->byte_len = 8;
break;
- case MLX5_OPCODE_BIND_MW:
- wc->opcode = IB_WC_BIND_MW;
- break;
case MLX5_OPCODE_UMR:
wc->opcode = get_umr_comp(wq, idx);
break;
@@ -171,6 +168,7 @@ enum {
static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
struct mlx5_ib_qp *qp)
{
+ enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1);
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
struct mlx5_ib_srq *srq;
struct mlx5_ib_wq *wq;
@@ -236,6 +234,22 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
} else {
wc->pkey_index = 0;
}
+
+ if (ll != IB_LINK_LAYER_ETHERNET)
+ return;
+
+ switch (wc->sl & 0x3) {
+ case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH:
+ wc->network_hdr_type = RDMA_NETWORK_IB;
+ break;
+ case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6:
+ wc->network_hdr_type = RDMA_NETWORK_IPV6;
+ break;
+ case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4:
+ wc->network_hdr_type = RDMA_NETWORK_IPV4;
+ break;
+ }
+ wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
}
static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
@@ -760,12 +774,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
int eqn;
int err;
- if (attr->flags)
- return ERR_PTR(-EINVAL);
-
if (entries < 0)
return ERR_PTR(-EINVAL);
+ if (check_cq_create_flags(attr->flags))
+ return ERR_PTR(-EOPNOTSUPP);
+
entries = roundup_pow_of_two(entries + 1);
if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
return ERR_PTR(-EINVAL);
@@ -779,6 +793,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
+ cq->create_flags = attr->flags;
if (context) {
err = create_cq_user(dev, udata, context, cq, entries,
@@ -796,6 +811,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
cq->cqe_size = cqe_size;
cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+
+ if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN)
+ cqb->ctx.cqe_sz_flags |= (1 << 1);
+
cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index);
err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
if (err)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b0ec175..ec737e2 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -40,6 +40,8 @@
#include <linux/io-mapping.h>
#include <linux/sched.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include <linux/mlx5/vport.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
@@ -66,12 +68,14 @@ static char mlx5_version[] =
DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
+enum {
+ MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
+};
+
static enum rdma_link_layer
-mlx5_ib_port_link_layer(struct ib_device *device)
+mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
- struct mlx5_ib_dev *dev = to_mdev(device);
-
- switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
+ switch (port_type_cap) {
case MLX5_CAP_PORT_TYPE_IB:
return IB_LINK_LAYER_INFINIBAND;
case MLX5_CAP_PORT_TYPE_ETH:
@@ -81,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device)
}
}
+static enum rdma_link_layer
+mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
+
+ return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+}
+
+static int mlx5_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
+ roce.nb);
+
+ if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER))
+ return NOTIFY_DONE;
+
+ write_lock(&ibdev->roce.netdev_lock);
+ if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
+ ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev;
+ write_unlock(&ibdev->roce.netdev_lock);
+
+ return NOTIFY_DONE;
+}
+
+static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
+ u8 port_num)
+{
+ struct mlx5_ib_dev *ibdev = to_mdev(device);
+ struct net_device *ndev;
+
+ /* Ensure ndev does not disappear before we invoke dev_hold()
+ */
+ read_lock(&ibdev->roce.netdev_lock);
+ ndev = ibdev->roce.netdev;
+ if (ndev)
+ dev_hold(ndev);
+ read_unlock(&ibdev->roce.netdev_lock);
+
+ return ndev;
+}
+
+static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
+ struct ib_port_attr *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct net_device *ndev;
+ enum ib_mtu ndev_ib_mtu;
+ u16 qkey_viol_cntr;
+
+ memset(props, 0, sizeof(*props));
+
+ props->port_cap_flags |= IB_PORT_CM_SUP;
+ props->port_cap_flags |= IB_PORT_IP_BASED_GIDS;
+
+ props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev,
+ roce_address_table_size);
+ props->max_mtu = IB_MTU_4096;
+ props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
+ props->pkey_tbl_len = 1;
+ props->state = IB_PORT_DOWN;
+ props->phys_state = 3;
+
+ mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
+ props->qkey_viol_cntr = qkey_viol_cntr;
+
+ ndev = mlx5_ib_get_netdev(device, port_num);
+ if (!ndev)
+ return 0;
+
+ if (netif_running(ndev) && netif_carrier_ok(ndev)) {
+ props->state = IB_PORT_ACTIVE;
+ props->phys_state = 5;
+ }
+
+ ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
+
+ dev_put(ndev);
+
+ props->active_mtu = min(props->max_mtu, ndev_ib_mtu);
+
+ props->active_width = IB_WIDTH_4X; /* TODO */
+ props->active_speed = IB_SPEED_QDR; /* TODO */
+
+ return 0;
+}
+
+static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ void *mlx5_addr)
+{
+#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
+ char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+ source_l3_address);
+ void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+ source_mac_47_32);
+
+ if (!gid)
+ return;
+
+ ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
+
+ if (is_vlan_dev(attr->ndev)) {
+ MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
+ MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
+ }
+
+ switch (attr->gid_type) {
+ case IB_GID_TYPE_IB:
+ MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
+ break;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
+ break;
+
+ default:
+ WARN_ON(true);
+ }
+
+ if (attr->gid_type != IB_GID_TYPE_IB) {
+ if (ipv6_addr_v4mapped((void *)gid))
+ MLX5_SET_RA(mlx5_addr, roce_l3_type,
+ MLX5_ROCE_L3_TYPE_IPV4);
+ else
+ MLX5_SET_RA(mlx5_addr, roce_l3_type,
+ MLX5_ROCE_L3_TYPE_IPV6);
+ }
+
+ if ((attr->gid_type == IB_GID_TYPE_IB) ||
+ !ipv6_addr_v4mapped((void *)gid))
+ memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
+ else
+ memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
+}
+
+static int set_roce_addr(struct ib_device *device, u8 port_num,
+ unsigned int index,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ u32 in[MLX5_ST_SZ_DW(set_roce_address_in)];
+ u32 out[MLX5_ST_SZ_DW(set_roce_address_out)];
+ void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
+
+ if (ll != IB_LINK_LAYER_ETHERNET)
+ return -EINVAL;
+
+ memset(in, 0, sizeof(in));
+
+ ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
+
+ MLX5_SET(set_roce_address_in, in, roce_address_index, index);
+ MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
+
+ memset(out, 0, sizeof(out));
+ return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
+ unsigned int index, const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ __always_unused void **context)
+{
+ return set_roce_addr(device, port_num, index, gid, attr);
+}
+
+static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
+ unsigned int index, __always_unused void **context)
+{
+ return set_roce_addr(device, port_num, index, NULL, NULL);
+}
+
+__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
+ int index)
+{
+ struct ib_gid_attr attr;
+ union ib_gid gid;
+
+ if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
+ return 0;
+
+ if (!attr.ndev)
+ return 0;
+
+ dev_put(attr.ndev);
+
+ if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+ return 0;
+
+ return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
+}
+
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
return !dev->mdev->issi;
@@ -97,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev)
if (mlx5_use_mad_ifc(to_mdev(ibdev)))
return MLX5_VPORT_ACCESS_METHOD_MAD;
- if (mlx5_ib_port_link_layer(ibdev) ==
+ if (mlx5_ib_port_link_layer(ibdev, 1) ==
IB_LINK_LAYER_ETHERNET)
return MLX5_VPORT_ACCESS_METHOD_NIC;
return MLX5_VPORT_ACCESS_METHOD_HCA;
}
+static void get_atomic_caps(struct mlx5_ib_dev *dev,
+ struct ib_device_attr *props)
+{
+ u8 tmp;
+ u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
+ u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
+ u8 atomic_req_8B_endianness_mode =
+ MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
+
+ /* Check if HW supports 8 bytes standard atomic operations and capable
+ * of host endianness respond
+ */
+ tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
+ if (((atomic_operations & tmp) == tmp) &&
+ (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
+ (atomic_req_8B_endianness_mode)) {
+ props->atomic_cap = IB_ATOMIC_HCA;
+ } else {
+ props->atomic_cap = IB_ATOMIC_NONE;
+ }
+}
+
static int mlx5_query_system_image_guid(struct ib_device *ibdev,
__be64 *sys_image_guid)
{
@@ -119,13 +341,21 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev,
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
- if (!err)
- *sys_image_guid = cpu_to_be64(tmp);
- return err;
+ break;
+
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
+ break;
default:
return -EINVAL;
}
+
+ if (!err)
+ *sys_image_guid = cpu_to_be64(tmp);
+
+ return err;
+
}
static int mlx5_query_max_pkeys(struct ib_device *ibdev,
@@ -179,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
- if (!err)
- *node_guid = cpu_to_be64(tmp);
- return err;
+ break;
+
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
+ break;
default:
return -EINVAL;
}
+
+ if (!err)
+ *node_guid = cpu_to_be64(tmp);
+
+ return err;
}
struct mlx5_reg_node_desc {
@@ -263,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (MLX5_CAP_GEN(mdev, block_lb_mc))
props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+ if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+ (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+
props->vendor_part_id = mdev->pdev->device;
props->hw_ver = mdev->pdev->revision;
@@ -278,7 +519,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_sge = min(max_rq_sg, max_sq_sg);
props->max_sge_rd = props->max_sge;
props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
- props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
+ props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
@@ -289,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
props->max_srq_sge = max_rq_sg - 1;
props->max_fast_reg_page_list_len = (unsigned int)-1;
- props->atomic_cap = IB_ATOMIC_NONE;
+ get_atomic_caps(dev, props);
props->masked_atomic_cap = IB_ATOMIC_NONE;
props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
props->max_mcast_grp;
props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+ props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
+ props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (MLX5_CAP_GEN(mdev, pg))
@@ -303,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->odp_caps = dev->odp_caps;
#endif
+ if (MLX5_CAP_GEN(mdev, cd))
+ props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
+
return 0;
}
@@ -483,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
case MLX5_VPORT_ACCESS_METHOD_HCA:
return mlx5_query_hca_port(ibdev, port, props);
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ return mlx5_query_port_roce(ibdev, port, props);
+
default:
return -EINVAL;
}
@@ -583,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
struct ib_udata *udata)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
- struct mlx5_ib_alloc_ucontext_req_v2 req;
- struct mlx5_ib_alloc_ucontext_resp resp;
+ struct mlx5_ib_alloc_ucontext_req_v2 req = {};
+ struct mlx5_ib_alloc_ucontext_resp resp = {};
struct mlx5_ib_ucontext *context;
struct mlx5_uuar_info *uuari;
struct mlx5_uar *uars;
@@ -599,20 +848,22 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (!dev->ib_active)
return ERR_PTR(-EAGAIN);
- memset(&req, 0, sizeof(req));
+ if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
+ return ERR_PTR(-EINVAL);
+
reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
ver = 0;
- else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+ else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
ver = 2;
else
return ERR_PTR(-EINVAL);
- err = ib_copy_from_udata(&req, udata, reqlen);
+ err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
if (err)
return ERR_PTR(err);
- if (req.flags || req.reserved)
+ if (req.flags)
return ERR_PTR(-EINVAL);
if (req.total_num_uuars > MLX5_MAX_UUARS)
@@ -621,6 +872,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (req.total_num_uuars == 0)
return ERR_PTR(-EINVAL);
+ if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (reqlen > sizeof(req) &&
+ !ib_is_udata_cleared(udata, sizeof(req),
+ reqlen - sizeof(req)))
+ return ERR_PTR(-EOPNOTSUPP);
+
req.total_num_uuars = ALIGN(req.total_num_uuars,
MLX5_NON_FP_BF_REGS_PER_PAGE);
if (req.num_low_latency_uuars > req.total_num_uuars - 1)
@@ -636,6 +895,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+ resp.cqe_version = min_t(__u8,
+ (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
+ req.max_cqe_version);
+ resp.response_length = min(offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length), udata->outlen);
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
@@ -681,22 +945,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
+ err = mlx5_core_alloc_transport_domain(dev->mdev,
+ &context->tdn);
+ if (err)
+ goto out_uars;
+ }
+
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
resp.tot_uuars = req.total_num_uuars;
resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
- err = ib_copy_to_udata(udata, &resp,
- sizeof(resp) - sizeof(resp.reserved));
+
+ if (field_avail(typeof(resp), cqe_version, udata->outlen))
+ resp.response_length += sizeof(resp.cqe_version);
+
+ if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+ resp.comp_mask |=
+ MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
+ resp.hca_core_clock_offset =
+ offsetof(struct mlx5_init_seg, internal_timer_h) %
+ PAGE_SIZE;
+ resp.response_length += sizeof(resp.hca_core_clock_offset) +
+ sizeof(resp.reserved2) +
+ sizeof(resp.reserved3);
+ }
+
+ err = ib_copy_to_udata(udata, &resp, resp.response_length);
if (err)
- goto out_uars;
+ goto out_td;
uuari->ver = ver;
uuari->num_low_latency_uuars = req.num_low_latency_uuars;
uuari->uars = uars;
uuari->num_uars = num_uars;
+ context->cqe_version = resp.cqe_version;
+
return &context->ibucontext;
+out_td:
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+ mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
out_uars:
for (i--; i >= 0; i--)
mlx5_cmd_free_uar(dev->mdev, uars[i].index);
@@ -721,6 +1012,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
struct mlx5_uuar_info *uuari = &context->uuari;
int i;
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+ mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
for (i = 0; i < uuari->num_uars; i++) {
if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
@@ -790,6 +1084,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
return -ENOSYS;
+ case MLX5_IB_MMAP_CORE_CLOCK:
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ /* Don't expose to user-space information it shouldn't have */
+ if (PAGE_SIZE > 4096)
+ return -EOPNOTSUPP;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ pfn = (dev->mdev->iseg_base +
+ offsetof(struct mlx5_init_seg, internal_timer_h)) >>
+ PAGE_SHIFT;
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
+ vma->vm_start,
+ (unsigned long long)pfn << PAGE_SHIFT);
+ break;
+
default:
return -EINVAL;
}
@@ -1758,6 +2076,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
mlx5_ib_dealloc_pd(devr->p0);
}
+static u32 get_core_cap_flags(struct ib_device *ibdev)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
+ u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
+ u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
+ u32 ret = 0;
+
+ if (ll == IB_LINK_LAYER_INFINIBAND)
+ return RDMA_CORE_PORT_IBA_IB;
+
+ if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
+ return 0;
+
+ if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
+ return 0;
+
+ if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
+ ret |= RDMA_CORE_PORT_IBA_ROCE;
+
+ if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
+ ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+ return ret;
+}
+
static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
struct ib_port_immutable *immutable)
{
@@ -1770,20 +2114,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
- immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ immutable->core_cap_flags = get_core_cap_flags(ibdev);
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
return 0;
}
+static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
+{
+ int err;
+
+ dev->roce.nb.notifier_call = mlx5_netdev_event;
+ err = register_netdevice_notifier(&dev->roce.nb);
+ if (err)
+ return err;
+
+ err = mlx5_nic_vport_enable_roce(dev->mdev);
+ if (err)
+ goto err_unregister_netdevice_notifier;
+
+ return 0;
+
+err_unregister_netdevice_notifier:
+ unregister_netdevice_notifier(&dev->roce.nb);
+ return err;
+}
+
+static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
+{
+ mlx5_nic_vport_disable_roce(dev->mdev);
+ unregister_netdevice_notifier(&dev->roce.nb);
+}
+
static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
{
struct mlx5_ib_dev *dev;
+ enum rdma_link_layer ll;
+ int port_type_cap;
int err;
int i;
- /* don't create IB instance over Eth ports, no RoCE yet! */
- if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH)
+ port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+ ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+
+ if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
return NULL;
printk_once(KERN_INFO "%s", mlx5_version);
@@ -1794,6 +2168,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->mdev = mdev;
+ rwlock_init(&dev->roce.netdev_lock);
err = get_port_caps(dev);
if (err)
goto err_dealloc;
@@ -1843,7 +2218,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.query_device = mlx5_ib_query_device;
dev->ib_dev.query_port = mlx5_ib_query_port;
+ dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer;
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ dev->ib_dev.get_netdev = mlx5_ib_get_netdev;
dev->ib_dev.query_gid = mlx5_ib_query_gid;
+ dev->ib_dev.add_gid = mlx5_ib_add_gid;
+ dev->ib_dev.del_gid = mlx5_ib_del_gid;
dev->ib_dev.query_pkey = mlx5_ib_query_pkey;
dev->ib_dev.modify_device = mlx5_ib_modify_device;
dev->ib_dev.modify_port = mlx5_ib_modify_port;
@@ -1893,7 +2273,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
}
- if (mlx5_ib_port_link_layer(&dev->ib_dev) ==
+ if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
IB_LINK_LAYER_ETHERNET) {
dev->ib_dev.create_flow = mlx5_ib_create_flow;
dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
@@ -1908,9 +2288,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
mutex_init(&dev->flow_db.lock);
mutex_init(&dev->cap_mask_mutex);
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ err = mlx5_enable_roce(dev);
+ if (err)
+ goto err_dealloc;
+ }
+
err = create_dev_resources(&dev->devr);
if (err)
- goto err_dealloc;
+ goto err_disable_roce;
err = mlx5_ib_odp_init_one(dev);
if (err)
@@ -1947,6 +2333,10 @@ err_odp:
err_rsrc:
destroy_dev_resources(&dev->devr);
+err_disable_roce:
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ mlx5_disable_roce(dev);
+
err_dealloc:
ib_dealloc_device((struct ib_device *)dev);
@@ -1956,11 +2346,14 @@ err_dealloc:
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ mlx5_disable_roce(dev);
ib_dealloc_device(&dev->ib_dev);
}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 1474ccc..d2b9737 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -42,6 +42,7 @@
#include <linux/mlx5/qp.h>
#include <linux/mlx5/srq.h>
#include <linux/types.h>
+#include <linux/mlx5/transobj.h>
#define mlx5_ib_dbg(dev, format, arg...) \
pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
@@ -55,6 +56,11 @@ pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
__LINE__, current->pid, ##arg)
+#define field_avail(type, fld, sz) (offsetof(type, fld) + \
+ sizeof(((type *)0)->fld) <= (sz))
+#define MLX5_IB_DEFAULT_UIDX 0xffffff
+#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
+
enum {
MLX5_IB_MMAP_CMD_SHIFT = 8,
MLX5_IB_MMAP_CMD_MASK = 0xff,
@@ -62,7 +68,9 @@ enum {
enum mlx5_ib_mmap_cmd {
MLX5_IB_MMAP_REGULAR_PAGE = 0,
- MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */
+ MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1,
+ /* 5 is chosen in order to be compatible with old versions of libmlx5 */
+ MLX5_IB_MMAP_CORE_CLOCK = 5,
};
enum {
@@ -85,6 +93,15 @@ enum mlx5_ib_mad_ifc_flags {
MLX5_MAD_IFC_NET_VIEW = 4,
};
+enum {
+ MLX5_CROSS_CHANNEL_UUAR = 0,
+};
+
+enum {
+ MLX5_CQE_VERSION_V0,
+ MLX5_CQE_VERSION_V1,
+};
+
struct mlx5_ib_ucontext {
struct ib_ucontext ibucontext;
struct list_head db_page_list;
@@ -93,6 +110,9 @@ struct mlx5_ib_ucontext {
*/
struct mutex db_page_mutex;
struct mlx5_uuar_info uuari;
+ u8 cqe_version;
+ /* Transport Domain number */
+ u32 tdn;
};
static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -201,47 +221,70 @@ struct mlx5_ib_pfault {
struct mlx5_pagefault mpfault;
};
+struct mlx5_ib_ubuffer {
+ struct ib_umem *umem;
+ int buf_size;
+ u64 buf_addr;
+};
+
+struct mlx5_ib_qp_base {
+ struct mlx5_ib_qp *container_mibqp;
+ struct mlx5_core_qp mqp;
+ struct mlx5_ib_ubuffer ubuffer;
+};
+
+struct mlx5_ib_qp_trans {
+ struct mlx5_ib_qp_base base;
+ u16 xrcdn;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+};
+
struct mlx5_ib_rq {
+ struct mlx5_ib_qp_base base;
+ struct mlx5_ib_wq *rq;
+ struct mlx5_ib_ubuffer ubuffer;
+ struct mlx5_db *doorbell;
u32 tirn;
+ u8 state;
+};
+
+struct mlx5_ib_sq {
+ struct mlx5_ib_qp_base base;
+ struct mlx5_ib_wq *sq;
+ struct mlx5_ib_ubuffer ubuffer;
+ struct mlx5_db *doorbell;
+ u32 tisn;
+ u8 state;
};
struct mlx5_ib_raw_packet_qp {
+ struct mlx5_ib_sq sq;
struct mlx5_ib_rq rq;
};
struct mlx5_ib_qp {
struct ib_qp ibqp;
union {
- struct mlx5_core_qp mqp;
- struct mlx5_ib_raw_packet_qp raw_packet_qp;
+ struct mlx5_ib_qp_trans trans_qp;
+ struct mlx5_ib_raw_packet_qp raw_packet_qp;
};
-
struct mlx5_buf buf;
struct mlx5_db db;
struct mlx5_ib_wq rq;
- u32 doorbell_qpn;
u8 sq_signal_bits;
u8 fm_cache;
- int sq_max_wqes_per_wr;
- int sq_spare_wqes;
struct mlx5_ib_wq sq;
- struct ib_umem *umem;
- int buf_size;
-
/* serialize qp state modifications
*/
struct mutex mutex;
- u16 xrcdn;
u32 flags;
u8 port;
- u8 alt_port;
- u8 atomic_rd_en;
- u8 resp_depth;
u8 state;
- int mlx_type;
int wq_sig;
int scat_cqe;
int max_inline_data;
@@ -284,6 +327,9 @@ struct mlx5_ib_cq_buf {
enum mlx5_ib_qp_flags {
MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0,
MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1,
+ MLX5_IB_QP_CROSS_CHANNEL = 1 << 2,
+ MLX5_IB_QP_MANAGED_SEND = 1 << 3,
+ MLX5_IB_QP_MANAGED_RECV = 1 << 4,
};
struct mlx5_umr_wr {
@@ -326,6 +372,7 @@ struct mlx5_ib_cq {
struct mlx5_ib_cq_buf *resize_buf;
struct ib_umem *resize_umem;
int cqe_size;
+ u32 create_flags;
};
struct mlx5_ib_srq {
@@ -449,9 +496,19 @@ struct mlx5_ib_resources {
struct ib_srq *s1;
};
+struct mlx5_roce {
+ /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
+ * netdev pointer
+ */
+ rwlock_t netdev_lock;
+ struct net_device *netdev;
+ struct notifier_block nb;
+};
+
struct mlx5_ib_dev {
struct ib_device ib_dev;
struct mlx5_core_dev *mdev;
+ struct mlx5_roce roce;
MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
int num_ports;
/* serialize update of capability mask
@@ -498,7 +555,7 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)
{
- return container_of(mqp, struct mlx5_ib_qp, mqp);
+ return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp;
}
static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr)
@@ -550,8 +607,6 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
const void *in_mad, void *response_mad);
-struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
- struct mlx5_ib_ah *ah);
struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
int mlx5_ib_destroy_ah(struct ib_ah *ah);
@@ -578,7 +633,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
- void *buffer, u32 length);
+ void *buffer, u32 length,
+ struct mlx5_ib_qp_base *base);
struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
@@ -680,6 +736,9 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
+ int index);
+
static inline void init_query_mad(struct ib_smp *mad)
{
mad->base_version = 1;
@@ -705,4 +764,28 @@ static inline int is_qp1(enum ib_qp_type qp_type)
#define MLX5_MAX_UMR_SHIFT 16
#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
+static inline u32 check_cq_create_flags(u32 flags)
+{
+ /*
+ * It returns non-zero value for unsupported CQ
+ * create flags, otherwise it returns zero.
+ */
+ return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN |
+ IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
+}
+
+static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx,
+ u32 *user_index)
+{
+ if (cqe_version) {
+ if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) ||
+ (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK))
+ return -EINVAL;
+ *user_index = cmd_uidx;
+ } else {
+ *user_index = MLX5_IB_DEFAULT_UIDX;
+ }
+
+ return 0;
+}
#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index aa8391e..b8d7636 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -153,14 +153,16 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault,
- int error) {
+ int error)
+{
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
- int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+ u32 qpn = qp->trans_qp.base.mqp.qpn;
+ int ret = mlx5_core_page_fault_resume(dev->mdev,
+ qpn,
pfault->mpfault.flags,
error);
if (ret)
- pr_err("Failed to resolve the page fault on QP 0x%x\n",
- qp->mqp.qpn);
+ pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
}
/*
@@ -391,6 +393,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
#if defined(DEBUG)
u32 ctrl_wqe_index, ctrl_qpn;
#endif
+ u32 qpn = qp->trans_qp.base.mqp.qpn;
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
@@ -401,7 +404,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
if (ds == 0) {
mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
- wqe_index, qp->mqp.qpn);
+ wqe_index, qpn);
return -EFAULT;
}
@@ -411,16 +414,16 @@ static int mlx5_ib_mr_initiator_pfault_handler(
MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
if (wqe_index != ctrl_wqe_index) {
mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
- wqe_index, qp->mqp.qpn,
+ wqe_index, qpn,
ctrl_wqe_index);
return -EFAULT;
}
ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
MLX5_WQE_CTRL_QPN_SHIFT;
- if (qp->mqp.qpn != ctrl_qpn) {
+ if (qpn != ctrl_qpn) {
mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
- wqe_index, qp->mqp.qpn,
+ wqe_index, qpn,
ctrl_qpn);
return -EFAULT;
}
@@ -537,6 +540,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
int resume_with_error = 0;
u16 wqe_index = pfault->mpfault.wqe.wqe_index;
int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
+ u32 qpn = qp->trans_qp.base.mqp.qpn;
buffer = (char *)__get_free_page(GFP_KERNEL);
if (!buffer) {
@@ -546,10 +550,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
}
ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
- PAGE_SIZE);
+ PAGE_SIZE, &qp->trans_qp.base);
if (ret < 0) {
mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
- -ret, wqe_index, qp->mqp.qpn);
+ -ret, wqe_index, qpn);
resume_with_error = 1;
goto resolve_page_fault;
}
@@ -586,7 +590,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
resolve_page_fault:
mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
- qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
+ qpn, resume_with_error,
+ pfault->mpfault.flags);
free_page((unsigned long)buffer);
}
@@ -753,7 +758,7 @@ void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
qp->disable_page_faults = 1;
spin_lock_init(&qp->disable_page_faults_lock);
- qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
+ qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;
for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 307bdbc..8fb9c27 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -32,6 +32,8 @@
#include <linux/module.h>
#include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_user_verbs.h>
#include "mlx5_ib.h"
#include "user.h"
@@ -114,14 +116,15 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
* Return: the number of bytes copied, or an error code.
*/
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
- void *buffer, u32 length)
+ void *buffer, u32 length,
+ struct mlx5_ib_qp_base *base)
{
struct ib_device *ibdev = qp->ibqp.device;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
size_t offset;
size_t wq_end;
- struct ib_umem *umem = qp->umem;
+ struct ib_umem *umem = base->ubuffer.umem;
u32 first_copy_length;
int wqe_length;
int ret;
@@ -172,8 +175,10 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
struct ib_event event;
- if (type == MLX5_EVENT_TYPE_PATH_MIG)
- to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+ if (type == MLX5_EVENT_TYPE_PATH_MIG) {
+ /* This event is only valid for trans_qps */
+ to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port;
+ }
if (ibqp->event_handler) {
event.device = ibqp->device;
@@ -366,7 +371,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
static int set_user_buf_size(struct mlx5_ib_dev *dev,
struct mlx5_ib_qp *qp,
- struct mlx5_ib_create_qp *ucmd)
+ struct mlx5_ib_create_qp *ucmd,
+ struct mlx5_ib_qp_base *base,
+ struct ib_qp_init_attr *attr)
{
int desc_sz = 1 << qp->sq.wqe_shift;
@@ -391,8 +398,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
return -EINVAL;
}
- qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
- (qp->sq.wqe_cnt << 6);
+ if (attr->qp_type == IB_QPT_RAW_PACKET) {
+ base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
+ } else {
+ base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << 6);
+ }
return 0;
}
@@ -578,8 +590,8 @@ static int to_mlx5_st(enum ib_qp_type type)
case IB_QPT_SMI: return MLX5_QP_ST_QP0;
case IB_QPT_GSI: return MLX5_QP_ST_QP1;
case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6;
- case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE;
case IB_QPT_RAW_PACKET:
+ case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE;
case IB_QPT_MAX:
default: return -EINVAL;
}
@@ -590,13 +602,51 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
}
+static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
+ struct ib_pd *pd,
+ unsigned long addr, size_t size,
+ struct ib_umem **umem,
+ int *npages, int *page_shift, int *ncont,
+ u32 *offset)
+{
+ int err;
+
+ *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+ if (IS_ERR(*umem)) {
+ mlx5_ib_dbg(dev, "umem_get failed\n");
+ return PTR_ERR(*umem);
+ }
+
+ mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL);
+
+ err = mlx5_ib_get_buf_offset(addr, *page_shift, offset);
+ if (err) {
+ mlx5_ib_warn(dev, "bad offset\n");
+ goto err_umem;
+ }
+
+ mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n",
+ addr, size, *npages, *page_shift, *ncont, *offset);
+
+ return 0;
+
+err_umem:
+ ib_umem_release(*umem);
+ *umem = NULL;
+
+ return err;
+}
+
static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
struct mlx5_ib_qp *qp, struct ib_udata *udata,
+ struct ib_qp_init_attr *attr,
struct mlx5_create_qp_mbox_in **in,
- struct mlx5_ib_create_qp_resp *resp, int *inlen)
+ struct mlx5_ib_create_qp_resp *resp, int *inlen,
+ struct mlx5_ib_qp_base *base)
{
struct mlx5_ib_ucontext *context;
struct mlx5_ib_create_qp ucmd;
+ struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
int page_shift = 0;
int uar_index;
int npages;
@@ -615,18 +665,23 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
/*
* TBD: should come from the verbs when we have the API
*/
- uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
- if (uuarn < 0) {
- mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
- mlx5_ib_dbg(dev, "reverting to medium latency\n");
- uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
+ if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+ /* In CROSS_CHANNEL CQ and QP must use the same UAR */
+ uuarn = MLX5_CROSS_CHANNEL_UUAR;
+ else {
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
if (uuarn < 0) {
- mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
- mlx5_ib_dbg(dev, "reverting to high latency\n");
- uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+ mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+ mlx5_ib_dbg(dev, "reverting to medium latency\n");
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
if (uuarn < 0) {
- mlx5_ib_warn(dev, "uuar allocation failed\n");
- return uuarn;
+ mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
+ mlx5_ib_dbg(dev, "reverting to high latency\n");
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+ if (uuarn < 0) {
+ mlx5_ib_warn(dev, "uuar allocation failed\n");
+ return uuarn;
+ }
}
}
}
@@ -638,32 +693,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
- err = set_user_buf_size(dev, qp, &ucmd);
+ err = set_user_buf_size(dev, qp, &ucmd, base, attr);
if (err)
goto err_uuar;
- if (ucmd.buf_addr && qp->buf_size) {
- qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
- qp->buf_size, 0, 0);
- if (IS_ERR(qp->umem)) {
- mlx5_ib_dbg(dev, "umem_get failed\n");
- err = PTR_ERR(qp->umem);
+ if (ucmd.buf_addr && ubuffer->buf_size) {
+ ubuffer->buf_addr = ucmd.buf_addr;
+ err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
+ ubuffer->buf_size,
+ &ubuffer->umem, &npages, &page_shift,
+ &ncont, &offset);
+ if (err)
goto err_uuar;
- }
} else {
- qp->umem = NULL;
- }
-
- if (qp->umem) {
- mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift,
- &ncont, NULL);
- err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset);
- if (err) {
- mlx5_ib_warn(dev, "bad offset\n");
- goto err_umem;
- }
- mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
- ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);
+ ubuffer->umem = NULL;
}
*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
@@ -672,8 +715,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
err = -ENOMEM;
goto err_umem;
}
- if (qp->umem)
- mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
+ if (ubuffer->umem)
+ mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift,
+ (*in)->pas, 0);
(*in)->ctx.log_pg_sz_remote_qpn =
cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
(*in)->ctx.params2 = cpu_to_be32(offset << 6);
@@ -704,29 +748,31 @@ err_free:
kvfree(*in);
err_umem:
- if (qp->umem)
- ib_umem_release(qp->umem);
+ if (ubuffer->umem)
+ ib_umem_release(ubuffer->umem);
err_uuar:
free_uuar(&context->uuari, uuarn);
return err;
}
-static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
+static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp,
+ struct mlx5_ib_qp_base *base)
{
struct mlx5_ib_ucontext *context;
context = to_mucontext(pd->uobject->context);
mlx5_ib_db_unmap_user(context, &qp->db);
- if (qp->umem)
- ib_umem_release(qp->umem);
+ if (base->ubuffer.umem)
+ ib_umem_release(base->ubuffer.umem);
free_uuar(&context->uuari, qp->uuarn);
}
static int create_kernel_qp(struct mlx5_ib_dev *dev,
struct ib_qp_init_attr *init_attr,
struct mlx5_ib_qp *qp,
- struct mlx5_create_qp_mbox_in **in, int *inlen)
+ struct mlx5_create_qp_mbox_in **in, int *inlen,
+ struct mlx5_ib_qp_base *base)
{
enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
struct mlx5_uuar_info *uuari;
@@ -758,9 +804,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
qp->rq.offset = 0;
qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
- qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+ base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
- err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf);
+ err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
if (err) {
mlx5_ib_dbg(dev, "err %d\n", err);
goto err_uuar;
@@ -853,19 +899,304 @@ static int is_connected(enum ib_qp_type qp_type)
return 0;
}
+static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_sq *sq, u32 tdn)
+{
+ u32 in[MLX5_ST_SZ_DW(create_tis_in)];
+ void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+ memset(in, 0, sizeof(in));
+
+ MLX5_SET(tisc, tisc, transport_domain, tdn);
+
+ return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
+}
+
+static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_sq *sq)
+{
+ mlx5_core_destroy_tis(dev->mdev, sq->tisn);
+}
+
+static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_sq *sq, void *qpin,
+ struct ib_pd *pd)
+{
+ struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer;
+ __be64 *pas;
+ void *in;
+ void *sqc;
+ void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
+ void *wq;
+ int inlen;
+ int err;
+ int page_shift = 0;
+ int npages;
+ int ncont = 0;
+ u32 offset = 0;
+
+ err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
+ &sq->ubuffer.umem, &npages, &page_shift,
+ &ncont, &offset);
+ if (err)
+ return err;
+
+ inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont;
+ in = mlx5_vzalloc(inlen);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_umem;
+ }
+
+ sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+ MLX5_SET(sqc, sqc, flush_in_error_en, 1);
+ MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
+ MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index));
+ MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd));
+ MLX5_SET(sqc, sqc, tis_lst_sz, 1);
+ MLX5_SET(sqc, sqc, tis_num_0, sq->tisn);
+
+ wq = MLX5_ADDR_OF(sqc, sqc, wq);
+ MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+ MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
+ MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page));
+ MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
+ MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+ MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size));
+ MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+ MLX5_SET(wq, wq, page_offset, offset);
+
+ pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+ mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);
+
+ err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp);
+
+ kvfree(in);
+
+ if (err)
+ goto err_umem;
+
+ return 0;
+
+err_umem:
+ ib_umem_release(sq->ubuffer.umem);
+ sq->ubuffer.umem = NULL;
+
+ return err;
+}
+
+static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_sq *sq)
+{
+ mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
+ ib_umem_release(sq->ubuffer.umem);
+}
+
+static int get_rq_pas_size(void *qpc)
+{
+ u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12;
+ u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride);
+ u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size);
+ u32 page_offset = MLX5_GET(qpc, qpc, page_offset);
+ u32 po_quanta = 1 << (log_page_size - 6);
+ u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride);
+ u32 page_size = 1 << log_page_size;
+ u32 rq_sz_po = rq_sz + (page_offset * po_quanta);
+ u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size;
+
+ return rq_num_pas * sizeof(u64);
+}
+
+static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_rq *rq, void *qpin)
+{
+ __be64 *pas;
+ __be64 *qp_pas;
+ void *in;
+ void *rqc;
+ void *wq;
+ void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
+ int inlen;
+ int err;
+ u32 rq_pas_size = get_rq_pas_size(qpc);
+
+ inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size;
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+ MLX5_SET(rqc, rqc, vsd, 1);
+ MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+ MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+ MLX5_SET(rqc, rqc, flush_in_error_en, 1);
+ MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
+ MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));
+
+ wq = MLX5_ADDR_OF(rqc, rqc, wq);
+ MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+ MLX5_SET(wq, wq, end_padding_mode,
+ MLX5_GET64(qpc, qpc, end_padding_mode));
+ MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset));
+ MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
+ MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
+ MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4);
+ MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size));
+ MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size));
+
+ pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+ qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas);
+ memcpy(pas, qp_pas, rq_pas_size);
+
+ err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp);
+
+ kvfree(in);
+
+ return err;
+}
+
+static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_rq *rq)
+{
+ mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp);
+}
+
+static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_rq *rq, u32 tdn)
+{
+ u32 *in;
+ void *tirc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+ MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
+ MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
+ MLX5_SET(tirc, tirc, transport_domain, tdn);
+
+ err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
+
+ kvfree(in);
+
+ return err;
+}
+
+static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_rq *rq)
+{
+ mlx5_core_destroy_tir(dev->mdev, rq->tirn);
+}
+
+static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ struct mlx5_create_qp_mbox_in *in,
+ struct ib_pd *pd)
+{
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+ struct ib_uobject *uobj = pd->uobject;
+ struct ib_ucontext *ucontext = uobj->context;
+ struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+ int err;
+ u32 tdn = mucontext->tdn;
+
+ if (qp->sq.wqe_cnt) {
+ err = create_raw_packet_qp_tis(dev, sq, tdn);
+ if (err)
+ return err;
+
+ err = create_raw_packet_qp_sq(dev, sq, in, pd);
+ if (err)
+ goto err_destroy_tis;
+
+ sq->base.container_mibqp = qp;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ err = create_raw_packet_qp_rq(dev, rq, in);
+ if (err)
+ goto err_destroy_sq;
+
+ rq->base.container_mibqp = qp;
+
+ err = create_raw_packet_qp_tir(dev, rq, tdn);
+ if (err)
+ goto err_destroy_rq;
+ }
+
+ qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn :
+ rq->base.mqp.qpn;
+
+ return 0;
+
+err_destroy_rq:
+ destroy_raw_packet_qp_rq(dev, rq);
+err_destroy_sq:
+ if (!qp->sq.wqe_cnt)
+ return err;
+ destroy_raw_packet_qp_sq(dev, sq);
+err_destroy_tis:
+ destroy_raw_packet_qp_tis(dev, sq);
+
+ return err;
+}
+
+static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_qp *qp)
+{
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+
+ if (qp->rq.wqe_cnt) {
+ destroy_raw_packet_qp_tir(dev, rq);
+ destroy_raw_packet_qp_rq(dev, rq);
+ }
+
+ if (qp->sq.wqe_cnt) {
+ destroy_raw_packet_qp_sq(dev, sq);
+ destroy_raw_packet_qp_tis(dev, sq);
+ }
+}
+
+static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp)
+{
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+
+ sq->sq = &qp->sq;
+ rq->rq = &qp->rq;
+ sq->doorbell = &qp->db;
+ rq->doorbell = &qp->db;
+}
+
static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata, struct mlx5_ib_qp *qp)
{
struct mlx5_ib_resources *devr = &dev->devr;
struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_ib_qp_base *base;
struct mlx5_ib_create_qp_resp resp;
struct mlx5_create_qp_mbox_in *in;
struct mlx5_ib_create_qp ucmd;
int inlen = sizeof(*in);
int err;
+ u32 uidx = MLX5_IB_DEFAULT_UIDX;
+ void *qpc;
+
+ base = init_attr->qp_type == IB_QPT_RAW_PACKET ?
+ &qp->raw_packet_qp.rq.base :
+ &qp->trans_qp.base;
- mlx5_ib_odp_create_qp(qp);
+ if (init_attr->qp_type != IB_QPT_RAW_PACKET)
+ mlx5_ib_odp_create_qp(qp);
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
@@ -880,6 +1211,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
}
}
+ if (init_attr->create_flags &
+ (IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV)) {
+ if (!MLX5_CAP_GEN(mdev, cd)) {
+ mlx5_ib_dbg(dev, "cross-channel isn't supported\n");
+ return -EINVAL;
+ }
+ if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL)
+ qp->flags |= MLX5_IB_QP_CROSS_CHANNEL;
+ if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND)
+ qp->flags |= MLX5_IB_QP_MANAGED_SEND;
+ if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
+ qp->flags |= MLX5_IB_QP_MANAGED_RECV;
+ }
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
@@ -889,6 +1235,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return -EFAULT;
}
+ err = get_qp_user_index(to_mucontext(pd->uobject->context),
+ &ucmd, udata->inlen, &uidx);
+ if (err)
+ return err;
+
qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
} else {
@@ -918,11 +1269,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
ucmd.sq_wqe_count, max_wqes);
return -EINVAL;
}
- err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
+ err = create_user_qp(dev, pd, qp, udata, init_attr, &in,
+ &resp, &inlen, base);
if (err)
mlx5_ib_dbg(dev, "err %d\n", err);
} else {
- err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
+ err = create_kernel_qp(dev, init_attr, qp, &in, &inlen,
+ base);
if (err)
mlx5_ib_dbg(dev, "err %d\n", err);
}
@@ -954,6 +1307,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST);
+ if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+ in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER);
+ if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
+ in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND);
+ if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
+ in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV);
+
if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
int rcqe_sz;
int scqe_sz;
@@ -1018,26 +1378,35 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
- err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen);
+ if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ /* 0xffffff means we ask to work with cqe version 0 */
+ MLX5_SET(qpc, qpc, user_index, uidx);
+ }
+
+ if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
+ qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
+ raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
+ err = create_raw_packet_qp(dev, qp, in, pd);
+ } else {
+ err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
+ }
+
if (err) {
mlx5_ib_dbg(dev, "create qp failed\n");
goto err_create;
}
kvfree(in);
- /* Hardware wants QPN written in big-endian order (after
- * shifting) for send doorbell. Precompute this value to save
- * a little bit when posting sends.
- */
- qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
- qp->mqp.event = mlx5_ib_qp_event;
+ base->container_mibqp = qp;
+ base->mqp.event = mlx5_ib_qp_event;
return 0;
err_create:
if (qp->create_type == MLX5_QP_USER)
- destroy_qp_user(pd, qp);
+ destroy_qp_user(pd, qp, base);
else if (qp->create_type == MLX5_QP_KERNEL)
destroy_qp_kernel(dev, qp);
@@ -1129,11 +1498,11 @@ static void get_cqs(struct mlx5_ib_qp *qp,
case IB_QPT_UD:
case IB_QPT_RAW_IPV6:
case IB_QPT_RAW_ETHERTYPE:
+ case IB_QPT_RAW_PACKET:
*send_cq = to_mcq(qp->ibqp.send_cq);
*recv_cq = to_mcq(qp->ibqp.recv_cq);
break;
- case IB_QPT_RAW_PACKET:
case IB_QPT_MAX:
default:
*send_cq = NULL;
@@ -1142,45 +1511,66 @@ static void get_cqs(struct mlx5_ib_qp *qp,
}
}
+static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ u16 operation);
+
static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
{
struct mlx5_ib_cq *send_cq, *recv_cq;
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct mlx5_modify_qp_mbox_in *in;
int err;
+ base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
+ &qp->raw_packet_qp.rq.base :
+ &qp->trans_qp.base;
+
in = kzalloc(sizeof(*in), GFP_KERNEL);
if (!in)
return;
if (qp->state != IB_QPS_RESET) {
- mlx5_ib_qp_disable_pagefaults(qp);
- if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
- MLX5_QP_STATE_RST, in, 0, &qp->mqp))
- mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
- qp->mqp.qpn);
+ if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
+ mlx5_ib_qp_disable_pagefaults(qp);
+ err = mlx5_core_qp_modify(dev->mdev,
+ MLX5_CMD_OP_2RST_QP, in, 0,
+ &base->mqp);
+ } else {
+ err = modify_raw_packet_qp(dev, qp,
+ MLX5_CMD_OP_2RST_QP);
+ }
+ if (err)
+ mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n",
+ base->mqp.qpn);
}
get_cqs(qp, &send_cq, &recv_cq);
if (qp->create_type == MLX5_QP_KERNEL) {
mlx5_ib_lock_cqs(send_cq, recv_cq);
- __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
if (send_cq != recv_cq)
- __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ __mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
+ NULL);
mlx5_ib_unlock_cqs(send_cq, recv_cq);
}
- err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
- if (err)
- mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn);
- kfree(in);
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+ destroy_raw_packet_qp(dev, qp);
+ } else {
+ err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
+ if (err)
+ mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
+ base->mqp.qpn);
+ }
+ kfree(in);
if (qp->create_type == MLX5_QP_KERNEL)
destroy_qp_kernel(dev, qp);
else if (qp->create_type == MLX5_QP_USER)
- destroy_qp_user(&get_pd(qp)->ibpd, qp);
+ destroy_qp_user(&get_pd(qp)->ibpd, qp, base);
}
static const char *ib_qp_type_str(enum ib_qp_type type)
@@ -1234,6 +1624,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
return ERR_PTR(-EINVAL);
}
dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
+
+ if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
+ if (!pd->uobject) {
+ mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
+ return ERR_PTR(-EINVAL);
+ } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+ mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
}
switch (init_attr->qp_type) {
@@ -1250,6 +1650,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
}
/* fall through */
+ case IB_QPT_RAW_PACKET:
case IB_QPT_RC:
case IB_QPT_UC:
case IB_QPT_UD:
@@ -1272,19 +1673,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
else if (is_qp1(init_attr->qp_type))
qp->ibqp.qp_num = 1;
else
- qp->ibqp.qp_num = qp->mqp.qpn;
+ qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;
mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
- qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn,
+ qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
+ to_mcq(init_attr->recv_cq)->mcq.cqn,
to_mcq(init_attr->send_cq)->mcq.cqn);
- qp->xrcdn = xrcdn;
+ qp->trans_qp.xrcdn = xrcdn;
break;
case IB_QPT_RAW_IPV6:
case IB_QPT_RAW_ETHERTYPE:
- case IB_QPT_RAW_PACKET:
case IB_QPT_MAX:
default:
mlx5_ib_dbg(dev, "unsupported qp type %d\n",
@@ -1318,12 +1719,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att
if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
dest_rd_atomic = attr->max_dest_rd_atomic;
else
- dest_rd_atomic = qp->resp_depth;
+ dest_rd_atomic = qp->trans_qp.resp_depth;
if (attr_mask & IB_QP_ACCESS_FLAGS)
access_flags = attr->qp_access_flags;
else
- access_flags = qp->atomic_rd_en;
+ access_flags = qp->trans_qp.atomic_rd_en;
if (!dest_rd_atomic)
access_flags &= IB_ACCESS_REMOTE_WRITE;
@@ -1360,21 +1761,42 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
return rate + MLX5_STAT_RATE_OFFSET;
}
-static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
+static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
+ struct mlx5_ib_sq *sq, u8 sl)
+{
+ void *in;
+ void *tisc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(modify_tis_in, in, bitmask.prio, 1);
+
+ tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
+ MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));
+
+ err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);
+
+ kvfree(in);
+
+ return err;
+}
+
+static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ const struct ib_ah_attr *ah,
struct mlx5_qp_path *path, u8 port, int attr_mask,
u32 path_flags, const struct ib_qp_attr *attr)
{
+ enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
int err;
- path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
- path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
-
if (attr_mask & IB_QP_PKEY_INDEX)
path->pkey_index = attr->pkey_index;
- path->grh_mlid = ah->src_path_bits & 0x7f;
- path->rlid = cpu_to_be16(ah->dlid);
-
if (ah->ah_flags & IB_AH_GRH) {
if (ah->grh.sgid_index >=
dev->mdev->port_caps[port - 1].gid_table_len) {
@@ -1383,7 +1805,27 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
dev->mdev->port_caps[port - 1].gid_table_len);
return -EINVAL;
}
- path->grh_mlid |= 1 << 7;
+ }
+
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ if (!(ah->ah_flags & IB_AH_GRH))
+ return -EINVAL;
+ memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
+ path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
+ ah->grh.sgid_index);
+ path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
+ } else {
+ path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+ path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 :
+ 0;
+ path->rlid = cpu_to_be16(ah->dlid);
+ path->grh_mlid = ah->src_path_bits & 0x7f;
+ if (ah->ah_flags & IB_AH_GRH)
+ path->grh_mlid |= 1 << 7;
+ path->dci_cfi_prio_sl = ah->sl & 0xf;
+ }
+
+ if (ah->ah_flags & IB_AH_GRH) {
path->mgid_index = ah->grh.sgid_index;
path->hop_limit = ah->grh.hop_limit;
path->tclass_flowlabel =
@@ -1401,7 +1843,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
if (attr_mask & IB_QP_TIMEOUT)
path->ackto_lt = attr->timeout << 3;
- path->sl = ah->sl & 0xf;
+ if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
+ return modify_raw_packet_eth_prio(dev->mdev,
+ &qp->raw_packet_qp.sq,
+ ah->sl & 0xf);
return 0;
}
@@ -1549,12 +1994,154 @@ static int ib_mask_to_mlx5_opt(int ib_mask)
return result;
}
+static int modify_raw_packet_qp_rq(struct mlx5_core_dev *dev,
+ struct mlx5_ib_rq *rq, int new_state)
+{
+ void *in;
+ void *rqc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(modify_rq_in, in, rq_state, rq->state);
+
+ rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+ MLX5_SET(rqc, rqc, state, new_state);
+
+ err = mlx5_core_modify_rq(dev, rq->base.mqp.qpn, in, inlen);
+ if (err)
+ goto out;
+
+ rq->state = new_state;
+
+out:
+ kvfree(in);
+ return err;
+}
+
+static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev,
+ struct mlx5_ib_sq *sq, int new_state)
+{
+ void *in;
+ void *sqc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(modify_sq_in, in, sq_state, sq->state);
+
+ sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+ MLX5_SET(sqc, sqc, state, new_state);
+
+ err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen);
+ if (err)
+ goto out;
+
+ sq->state = new_state;
+
+out:
+ kvfree(in);
+ return err;
+}
+
+static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ u16 operation)
+{
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+ int rq_state;
+ int sq_state;
+ int err;
+
+ switch (operation) {
+ case MLX5_CMD_OP_RST2INIT_QP:
+ rq_state = MLX5_RQC_STATE_RDY;
+ sq_state = MLX5_SQC_STATE_RDY;
+ break;
+ case MLX5_CMD_OP_2ERR_QP:
+ rq_state = MLX5_RQC_STATE_ERR;
+ sq_state = MLX5_SQC_STATE_ERR;
+ break;
+ case MLX5_CMD_OP_2RST_QP:
+ rq_state = MLX5_RQC_STATE_RST;
+ sq_state = MLX5_SQC_STATE_RST;
+ break;
+ case MLX5_CMD_OP_INIT2INIT_QP:
+ case MLX5_CMD_OP_INIT2RTR_QP:
+ case MLX5_CMD_OP_RTR2RTS_QP:
+ case MLX5_CMD_OP_RTS2RTS_QP:
+ /* Nothing to do here... */
+ return 0;
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ err = modify_raw_packet_qp_rq(dev->mdev, rq, rq_state);
+ if (err)
+ return err;
+ }
+
+ if (qp->sq.wqe_cnt)
+ return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state);
+
+ return 0;
+}
+
static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
+ static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
+ [MLX5_QP_STATE_RST] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP,
+ },
+ [MLX5_QP_STATE_INIT] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP,
+ [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP,
+ },
+ [MLX5_QP_STATE_RTR] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP,
+ },
+ [MLX5_QP_STATE_RTS] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP,
+ },
+ [MLX5_QP_STATE_SQD] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ },
+ [MLX5_QP_STATE_SQER] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP,
+ },
+ [MLX5_QP_STATE_ERR] = {
+ [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
+ [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
+ }
+ };
+
struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct mlx5_ib_cq *send_cq, *recv_cq;
struct mlx5_qp_context *context;
struct mlx5_modify_qp_mbox_in *in;
@@ -1564,6 +2151,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
int sqd_event;
int mlx5_st;
int err;
+ u16 op;
in = kzalloc(sizeof(*in), GFP_KERNEL);
if (!in)
@@ -1623,7 +2211,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
context->pri_path.port = attr->port_num;
if (attr_mask & IB_QP_AV) {
- err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
+ err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path,
attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
attr_mask, 0, attr);
if (err)
@@ -1634,7 +2222,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
context->pri_path.ackto_lt |= attr->timeout << 3;
if (attr_mask & IB_QP_ALT_PATH) {
- err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+ err = mlx5_set_path(dev, qp, &attr->alt_ah_attr,
+ &context->alt_path,
attr->alt_port_num, attr_mask, 0, attr);
if (err)
goto out;
@@ -1706,41 +2295,51 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
* again to RTS, and may cause the driver and the device to get out of
* sync. */
if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
- (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) &&
+ (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
mlx5_ib_qp_disable_pagefaults(qp);
+ if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
+ !optab[mlx5_cur][mlx5_new])
+ goto out;
+
+ op = optab[mlx5_cur][mlx5_new];
optpar = ib_mask_to_mlx5_opt(attr_mask);
optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
in->optparam = cpu_to_be32(optpar);
- err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state),
- to_mlx5_state(new_state), in, sqd_event,
- &qp->mqp);
+
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
+ err = modify_raw_packet_qp(dev, qp, op);
+ else
+ err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event,
+ &base->mqp);
if (err)
goto out;
- if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT &&
+ (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
mlx5_ib_qp_enable_pagefaults(qp);
qp->state = new_state;
if (attr_mask & IB_QP_ACCESS_FLAGS)
- qp->atomic_rd_en = attr->qp_access_flags;
+ qp->trans_qp.atomic_rd_en = attr->qp_access_flags;
if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
- qp->resp_depth = attr->max_dest_rd_atomic;
+ qp->trans_qp.resp_depth = attr->max_dest_rd_atomic;
if (attr_mask & IB_QP_PORT)
qp->port = attr->port_num;
if (attr_mask & IB_QP_ALT_PATH)
- qp->alt_port = attr->alt_port_num;
+ qp->trans_qp.alt_port = attr->alt_port_num;
/*
* If we moved a kernel QP to RESET, clean up all old CQ
* entries and reinitialize the QP.
*/
if (new_state == IB_QPS_RESET && !ibqp->uobject) {
- mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
ibqp->srq ? to_msrq(ibqp->srq) : NULL);
if (send_cq != recv_cq)
- mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL);
qp->rq.head = 0;
qp->rq.tail = 0;
@@ -1765,15 +2364,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
enum ib_qp_state cur_state, new_state;
int err = -EINVAL;
int port;
+ enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
mutex_lock(&qp->mutex);
cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+ if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) {
+ port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
+ }
+
if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR &&
!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
- IB_LINK_LAYER_UNSPECIFIED))
+ ll))
goto out;
if ((attr_mask & IB_QP_PORT) &&
@@ -2570,7 +3175,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp,
ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
mlx5_opcode | ((u32)opmod << 24));
- ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
+ ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8));
ctrl->fm_ce_se |= fence;
qp->fm_cache = next_fence;
if (unlikely(qp->wq_sig))
@@ -3003,7 +3608,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at
ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports))
return;
- ib_ah_attr->sl = path->sl & 0xf;
+ ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf;
ib_ah_attr->dlid = be16_to_cpu(path->rlid);
ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
@@ -3021,39 +3626,153 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at
}
}
-int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
- struct ib_qp_init_attr *qp_init_attr)
+static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_sq *sq,
+ u8 *sq_state)
+{
+ void *out;
+ void *sqc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(query_sq_out);
+ out = mlx5_vzalloc(inlen);
+ if (!out)
+ return -ENOMEM;
+
+ err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out);
+ if (err)
+ goto out;
+
+ sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context);
+ *sq_state = MLX5_GET(sqc, sqc, state);
+ sq->state = *sq_state;
+
+out:
+ kvfree(out);
+ return err;
+}
+
+static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_rq *rq,
+ u8 *rq_state)
+{
+ void *out;
+ void *rqc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(query_rq_out);
+ out = mlx5_vzalloc(inlen);
+ if (!out)
+ return -ENOMEM;
+
+ err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out);
+ if (err)
+ goto out;
+
+ rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context);
+ *rq_state = MLX5_GET(rqc, rqc, state);
+ rq->state = *rq_state;
+
+out:
+ kvfree(out);
+ return err;
+}
+
+static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state,
+ struct mlx5_ib_qp *qp, u8 *qp_state)
+{
+ static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = {
+ [MLX5_RQC_STATE_RST] = {
+ [MLX5_SQC_STATE_RST] = IB_QPS_RESET,
+ [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD,
+ [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD,
+ [MLX5_SQ_STATE_NA] = IB_QPS_RESET,
+ },
+ [MLX5_RQC_STATE_RDY] = {
+ [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD,
+ [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE,
+ [MLX5_SQC_STATE_ERR] = IB_QPS_SQE,
+ [MLX5_SQ_STATE_NA] = MLX5_QP_STATE,
+ },
+ [MLX5_RQC_STATE_ERR] = {
+ [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD,
+ [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD,
+ [MLX5_SQC_STATE_ERR] = IB_QPS_ERR,
+ [MLX5_SQ_STATE_NA] = IB_QPS_ERR,
+ },
+ [MLX5_RQ_STATE_NA] = {
+ [MLX5_SQC_STATE_RST] = IB_QPS_RESET,
+ [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE,
+ [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE,
+ [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD,
+ },
+ };
+
+ *qp_state = sqrq_trans[rq_state][sq_state];
+
+ if (*qp_state == MLX5_QP_STATE_BAD) {
+ WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x",
+ qp->raw_packet_qp.sq.base.mqp.qpn, sq_state,
+ qp->raw_packet_qp.rq.base.mqp.qpn, rq_state);
+ return -EINVAL;
+ }
+
+ if (*qp_state == MLX5_QP_STATE)
+ *qp_state = qp->state;
+
+ return 0;
+}
+
+static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_qp *qp,
+ u8 *raw_packet_qp_state)
+{
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+ int err;
+ u8 sq_state = MLX5_SQ_STATE_NA;
+ u8 rq_state = MLX5_RQ_STATE_NA;
+
+ if (qp->sq.wqe_cnt) {
+ err = query_raw_packet_qp_sq_state(dev, sq, &sq_state);
+ if (err)
+ return err;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ err = query_raw_packet_qp_rq_state(dev, rq, &rq_state);
+ if (err)
+ return err;
+ }
+
+ return sqrq_state_to_qp_state(sq_state, rq_state, qp,
+ raw_packet_qp_state);
+}
+
+static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ struct ib_qp_attr *qp_attr)
{
- struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
- struct mlx5_ib_qp *qp = to_mqp(ibqp);
struct mlx5_query_qp_mbox_out *outb;
struct mlx5_qp_context *context;
int mlx5_state;
int err = 0;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- /*
- * Wait for any outstanding page faults, in case the user frees memory
- * based upon this query's result.
- */
- flush_workqueue(mlx5_ib_page_fault_wq);
-#endif
-
- mutex_lock(&qp->mutex);
outb = kzalloc(sizeof(*outb), GFP_KERNEL);
- if (!outb) {
- err = -ENOMEM;
- goto out;
- }
+ if (!outb)
+ return -ENOMEM;
+
context = &outb->ctx;
- err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb));
+ err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb,
+ sizeof(*outb));
if (err)
- goto out_free;
+ goto out;
mlx5_state = be32_to_cpu(context->flags) >> 28;
qp->state = to_ib_qp_state(mlx5_state);
- qp_attr->qp_state = qp->state;
qp_attr->path_mtu = context->mtu_msgmax >> 5;
qp_attr->path_mig_state =
to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
@@ -3087,6 +3806,43 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7;
qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7;
qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3;
+
+out:
+ kfree(outb);
+ return err;
+}
+
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ int err = 0;
+ u8 raw_packet_qp_state;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /*
+ * Wait for any outstanding page faults, in case the user frees memory
+ * based upon this query's result.
+ */
+ flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+ err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
+ if (err)
+ goto out;
+ qp->state = raw_packet_qp_state;
+ qp_attr->port_num = 1;
+ } else {
+ err = query_qp_attr(dev, qp, qp_attr);
+ if (err)
+ goto out;
+ }
+
+ qp_attr->qp_state = qp->state;
qp_attr->cur_qp_state = qp_attr->qp_state;
qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
qp_attr->cap.max_recv_sge = qp->rq.max_gs;
@@ -3110,12 +3866,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+ if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+ qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL;
+ if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
+ qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND;
+ if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
+ qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
+
qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
-out_free:
- kfree(outb);
-
out:
mutex_unlock(&qp->mutex);
return err;
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index e008505..4659256 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -78,28 +78,41 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
struct ib_udata *udata, int buf_size, int *inlen)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
- struct mlx5_ib_create_srq ucmd;
+ struct mlx5_ib_create_srq ucmd = {};
size_t ucmdlen;
+ void *xsrqc;
int err;
int npages;
int page_shift;
int ncont;
u32 offset;
+ u32 uidx = MLX5_IB_DEFAULT_UIDX;
+ int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
- ucmdlen =
- (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
- sizeof(ucmd)) ? (sizeof(ucmd) -
- sizeof(ucmd.reserved)) : sizeof(ucmd);
+ if (drv_data < 0)
+ return -EINVAL;
+
+ ucmdlen = (drv_data < sizeof(ucmd)) ?
+ drv_data : sizeof(ucmd);
if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
mlx5_ib_dbg(dev, "failed copy udata\n");
return -EFAULT;
}
- if (ucmdlen == sizeof(ucmd) &&
- ucmd.reserved != 0)
+ if (ucmd.reserved0 || ucmd.reserved1)
return -EINVAL;
+ if (drv_data > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ drv_data - sizeof(ucmd)))
+ return -EINVAL;
+
+ err = get_srq_user_index(to_mucontext(pd->uobject->context),
+ &ucmd, udata->inlen, &uidx);
+ if (err)
+ return err;
+
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
@@ -138,6 +151,12 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
(*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
+ if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
+ xrc_srq_context_entry);
+ MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
+ }
+
return 0;
err_in:
@@ -158,6 +177,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
struct mlx5_wqe_srq_next_seg *next;
int page_shift;
int npages;
+ void *xsrqc;
err = mlx5_db_alloc(dev->mdev, &srq->db);
if (err) {
@@ -204,6 +224,13 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
+ xrc_srq_context_entry);
+ /* 0xffffff means we ask to work with cqe version 0 */
+ MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX);
+ }
+
return 0;
err_in:
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h
index 76fb7b9..b94a554 100644
--- a/drivers/infiniband/hw/mlx5/user.h
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -35,6 +35,8 @@
#include <linux/types.h>
+#include "mlx5_ib.h"
+
enum {
MLX5_QP_FLAG_SIGNATURE = 1 << 0,
MLX5_QP_FLAG_SCATTER_CQE = 1 << 1,
@@ -66,7 +68,15 @@ struct mlx5_ib_alloc_ucontext_req_v2 {
__u32 total_num_uuars;
__u32 num_low_latency_uuars;
__u32 flags;
- __u32 reserved;
+ __u32 comp_mask;
+ __u8 max_cqe_version;
+ __u8 reserved0;
+ __u16 reserved1;
+ __u32 reserved2;
+};
+
+enum mlx5_ib_alloc_ucontext_resp_mask {
+ MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
};
struct mlx5_ib_alloc_ucontext_resp {
@@ -80,7 +90,13 @@ struct mlx5_ib_alloc_ucontext_resp {
__u32 max_recv_wr;
__u32 max_srq_recv_wr;
__u16 num_ports;
- __u16 reserved;
+ __u16 reserved1;
+ __u32 comp_mask;
+ __u32 response_length;
+ __u8 cqe_version;
+ __u8 reserved2;
+ __u16 reserved3;
+ __u64 hca_core_clock_offset;
};
struct mlx5_ib_alloc_pd_resp {
@@ -110,7 +126,9 @@ struct mlx5_ib_create_srq {
__u64 buf_addr;
__u64 db_addr;
__u32 flags;
- __u32 reserved; /* explicit padding (optional on i386) */
+ __u32 reserved0; /* explicit padding (optional on i386) */
+ __u32 uidx;
+ __u32 reserved1;
};
struct mlx5_ib_create_srq_resp {
@@ -125,9 +143,48 @@ struct mlx5_ib_create_qp {
__u32 rq_wqe_count;
__u32 rq_wqe_shift;
__u32 flags;
+ __u32 uidx;
+ __u32 reserved0;
+ __u64 sq_buf_addr;
};
struct mlx5_ib_create_qp_resp {
__u32 uuar_index;
};
+
+static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
+ struct mlx5_ib_create_qp *ucmd,
+ int inlen,
+ u32 *user_index)
+{
+ u8 cqe_version = ucontext->cqe_version;
+
+ if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) &&
+ !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+ return 0;
+
+ if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) !=
+ !!cqe_version))
+ return -EINVAL;
+
+ return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
+}
+
+static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext,
+ struct mlx5_ib_create_srq *ucmd,
+ int inlen,
+ u32 *user_index)
+{
+ u8 cqe_version = ucontext->cqe_version;
+
+ if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) &&
+ !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+ return 0;
+
+ if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) !=
+ !!cqe_version))
+ return -EINVAL;
+
+ return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
+}
#endif /* MLX5_IB_USER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index 40ba833..a6531ff 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -608,9 +608,6 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
entry->opcode = IB_WC_FETCH_ADD;
entry->byte_len = MTHCA_ATOMIC_BYTE_LEN;
break;
- case MTHCA_OPCODE_BIND_MW:
- entry->opcode = IB_WC_BIND_MW;
- break;
default:
entry->opcode = MTHCA_OPCODE_INVALID;
break;
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index dc2d48c..9866c35 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -898,89 +898,6 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
return &mr->ibmr;
}
-static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf,
- int acc,
- u64 *iova_start)
-{
- struct mthca_mr *mr;
- u64 *page_list;
- u64 total_size;
- unsigned long mask;
- int shift;
- int npages;
- int err;
- int i, j, n;
-
- mask = buffer_list[0].addr ^ *iova_start;
- total_size = 0;
- for (i = 0; i < num_phys_buf; ++i) {
- if (i != 0)
- mask |= buffer_list[i].addr;
- if (i != num_phys_buf - 1)
- mask |= buffer_list[i].addr + buffer_list[i].size;
-
- total_size += buffer_list[i].size;
- }
-
- if (mask & ~PAGE_MASK)
- return ERR_PTR(-EINVAL);
-
- shift = __ffs(mask | 1 << 31);
-
- buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
- buffer_list[0].addr &= ~0ull << shift;
-
- mr = kmalloc(sizeof *mr, GFP_KERNEL);
- if (!mr)
- return ERR_PTR(-ENOMEM);
-
- npages = 0;
- for (i = 0; i < num_phys_buf; ++i)
- npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
-
- if (!npages)
- return &mr->ibmr;
-
- page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
- if (!page_list) {
- kfree(mr);
- return ERR_PTR(-ENOMEM);
- }
-
- n = 0;
- for (i = 0; i < num_phys_buf; ++i)
- for (j = 0;
- j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
- ++j)
- page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
-
- mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
- "in PD %x; shift %d, npages %d.\n",
- (unsigned long long) buffer_list[0].addr,
- (unsigned long long) *iova_start,
- to_mpd(pd)->pd_num,
- shift, npages);
-
- err = mthca_mr_alloc_phys(to_mdev(pd->device),
- to_mpd(pd)->pd_num,
- page_list, shift, npages,
- *iova_start, total_size,
- convert_access(acc), mr);
-
- if (err) {
- kfree(page_list);
- kfree(mr);
- return ERR_PTR(err);
- }
-
- kfree(page_list);
- mr->umem = NULL;
-
- return &mr->ibmr;
-}
-
static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
@@ -1346,7 +1263,6 @@ int mthca_register_device(struct mthca_dev *dev)
dev->ib_dev.destroy_cq = mthca_destroy_cq;
dev->ib_dev.poll_cq = mthca_poll_cq;
dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
- dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr;
dev->ib_dev.reg_user_mr = mthca_reg_user_mr;
dev->ib_dev.dereg_mr = mthca_dereg_mr;
dev->ib_dev.get_port_immutable = mthca_port_immutable;
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 35fe506..96e5fb9 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1485,7 +1485,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
u16 pkey;
ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0,
- mthca_ah_grh_present(to_mah(wr->ah)), 0,
+ mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0,
&sqp->ud_header);
err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header);
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 8a3ad17..cb9f0f2 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -134,7 +134,7 @@ static void record_ird_ord(struct nes_cm_node *, u16, u16);
/* External CM API Interface */
/* instance of function pointers for client API */
/* set address of this instance to cm_core->cm_ops at cm_core alloc */
-static struct nes_cm_ops nes_cm_api = {
+static const struct nes_cm_ops nes_cm_api = {
mini_cm_accelerated,
mini_cm_listen,
mini_cm_del_listen,
@@ -3232,7 +3232,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
int passive_state;
struct nes_ib_device *nesibdev;
struct ib_mr *ibmr = NULL;
- struct ib_phys_buf ibphysbuf;
struct nes_pd *nespd;
u64 tagged_offset;
u8 mpa_frame_offset = 0;
@@ -3316,21 +3315,19 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
u64temp = (unsigned long)nesqp;
nesibdev = nesvnic->nesibdev;
nespd = nesqp->nespd;
- ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset;
- ibphysbuf.size = buff_len;
tagged_offset = (u64)(unsigned long)*start_buff;
- ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd,
- &ibphysbuf, 1,
- IB_ACCESS_LOCAL_WRITE,
- &tagged_offset);
- if (!ibmr) {
+ ibmr = nes_reg_phys_mr(&nespd->ibpd,
+ nesqp->ietf_frame_pbase + mpa_frame_offset,
+ buff_len, IB_ACCESS_LOCAL_WRITE,
+ &tagged_offset);
+ if (IS_ERR(ibmr)) {
nes_debug(NES_DBG_CM, "Unable to register memory region"
"for lSMM for cm_node = %p \n",
cm_node);
pci_free_consistent(nesdev->pcidev,
nesqp->private_data_len + nesqp->ietf_frame_size,
nesqp->ietf_frame, nesqp->ietf_frame_pbase);
- return -ENOMEM;
+ return PTR_ERR(ibmr);
}
ibmr->pd = &nespd->ibpd;
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
index 32a6420..147c2c8 100644
--- a/drivers/infiniband/hw/nes/nes_cm.h
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -423,7 +423,7 @@ struct nes_cm_core {
struct timer_list tcp_timer;
- struct nes_cm_ops *api;
+ const struct nes_cm_ops *api;
int (*post_event)(struct nes_cm_event *event);
atomic_t events_posted;
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index 2042c0f..6d3a169 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -727,7 +727,7 @@ int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 acti
if (action == NES_ARP_DELETE) {
nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
nesadapter->arp_table[arp_index].ip_addr = 0;
- memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN);
+ eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr);
nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
return arp_index;
}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 137880a..8c4daf7 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -206,80 +206,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
}
-/**
- * nes_bind_mw
- */
-static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
- struct ib_mw_bind *ibmw_bind)
-{
- u64 u64temp;
- struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
- struct nes_device *nesdev = nesvnic->nesdev;
- /* struct nes_mr *nesmr = to_nesmw(ibmw); */
- struct nes_qp *nesqp = to_nesqp(ibqp);
- struct nes_hw_qp_wqe *wqe;
- unsigned long flags = 0;
- u32 head;
- u32 wqe_misc = 0;
- u32 qsize;
-
- if (nesqp->ibqp_state > IB_QPS_RTS)
- return -EINVAL;
-
- spin_lock_irqsave(&nesqp->lock, flags);
-
- head = nesqp->hwqp.sq_head;
- qsize = nesqp->hwqp.sq_tail;
-
- /* Check for SQ overflow */
- if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
- spin_unlock_irqrestore(&nesqp->lock, flags);
- return -ENOMEM;
- }
-
- wqe = &nesqp->hwqp.sq_vbase[head];
- /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */
- nes_fill_init_qp_wqe(wqe, nesqp, head);
- u64temp = ibmw_bind->wr_id;
- set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp);
- wqe_misc = NES_IWARP_SQ_OP_BIND;
-
- wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
- if (ibmw_bind->send_flags & IB_SEND_SIGNALED)
- wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
-
- if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE)
- wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE;
- if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ)
- wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ;
-
- set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc);
- set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX,
- ibmw_bind->bind_info.mr->lkey);
- set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey);
- set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX,
- ibmw_bind->bind_info.length);
- wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0;
- u64temp = (u64)ibmw_bind->bind_info.addr;
- set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp);
-
- head++;
- if (head >= qsize)
- head = 0;
-
- nesqp->hwqp.sq_head = head;
- barrier();
-
- nes_write32(nesdev->regs+NES_WQE_ALLOC,
- (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-
- spin_unlock_irqrestore(&nesqp->lock, flags);
-
- return 0;
-}
-
-
/*
* nes_alloc_fast_mr
*/
@@ -2074,9 +2000,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
/**
* nes_reg_phys_mr
*/
-static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
- struct ib_phys_buf *buffer_list, int num_phys_buf, int acc,
- u64 * iova_start)
+struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
+ int acc, u64 *iova_start)
{
u64 region_length;
struct nes_pd *nespd = to_nespd(ib_pd);
@@ -2088,13 +2013,10 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
struct nes_vpbl vpbl;
struct nes_root_vpbl root_vpbl;
u32 stag;
- u32 i;
unsigned long mask;
u32 stag_index = 0;
u32 next_stag_index = 0;
u32 driver_key = 0;
- u32 root_pbl_index = 0;
- u32 cur_pbl_index = 0;
int err = 0;
int ret = 0;
u16 pbl_count = 0;
@@ -2113,11 +2035,8 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
next_stag_index >>= 8;
next_stag_index %= nesadapter->max_mr;
- if (num_phys_buf > (1024*512)) {
- return ERR_PTR(-E2BIG);
- }
- if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK)
+ if ((addr ^ *iova_start) & ~PAGE_MASK)
return ERR_PTR(-EINVAL);
err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
@@ -2132,84 +2051,33 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
return ERR_PTR(-ENOMEM);
}
- for (i = 0; i < num_phys_buf; i++) {
+ /* Allocate a 4K buffer for the PBL */
+ vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+ &vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
+ vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
+ if (!vpbl.pbl_vbase) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ ibmr = ERR_PTR(-ENOMEM);
+ kfree(nesmr);
+ goto reg_phys_err;
+ }
- if ((i & 0x01FF) == 0) {
- if (root_pbl_index == 1) {
- /* Allocate the root PBL */
- root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
- &root_vpbl.pbl_pbase);
- nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
- root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
- if (!root_vpbl.pbl_vbase) {
- pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
- vpbl.pbl_pbase);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- kfree(nesmr);
- return ERR_PTR(-ENOMEM);
- }
- root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
- if (!root_vpbl.leaf_vpbl) {
- pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
- root_vpbl.pbl_pbase);
- pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
- vpbl.pbl_pbase);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- kfree(nesmr);
- return ERR_PTR(-ENOMEM);
- }
- root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
- root_vpbl.pbl_vbase[0].pa_high =
- cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
- root_vpbl.leaf_vpbl[0] = vpbl;
- }
- /* Allocate a 4K buffer for the PBL */
- vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
- &vpbl.pbl_pbase);
- nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
- vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
- if (!vpbl.pbl_vbase) {
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- ibmr = ERR_PTR(-ENOMEM);
- kfree(nesmr);
- goto reg_phys_err;
- }
- /* Fill in the root table */
- if (1 <= root_pbl_index) {
- root_vpbl.pbl_vbase[root_pbl_index].pa_low =
- cpu_to_le32((u32)vpbl.pbl_pbase);
- root_vpbl.pbl_vbase[root_pbl_index].pa_high =
- cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
- root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
- }
- root_pbl_index++;
- cur_pbl_index = 0;
- }
- mask = !buffer_list[i].size;
- if (i != 0)
- mask |= buffer_list[i].addr;
- if (i != num_phys_buf - 1)
- mask |= buffer_list[i].addr + buffer_list[i].size;
-
- if (mask & ~PAGE_MASK) {
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
- ibmr = ERR_PTR(-EINVAL);
- kfree(nesmr);
- goto reg_phys_err;
- }
+ mask = !size;
- region_length += buffer_list[i].size;
- if ((i != 0) && (single_page)) {
- if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr)
- single_page = 0;
- }
- vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK);
- vpbl.pbl_vbase[cur_pbl_index++].pa_high =
- cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32)));
+ if (mask & ~PAGE_MASK) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
+ ibmr = ERR_PTR(-EINVAL);
+ kfree(nesmr);
+ goto reg_phys_err;
}
+ region_length += size;
+ vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK);
+ vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32)));
+
stag = stag_index << 8;
stag |= driver_key;
stag += (u32)stag_key;
@@ -2219,17 +2087,15 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
/* Make the leaf PBL the root if only one PBL */
- if (root_pbl_index == 1) {
- root_vpbl.pbl_pbase = vpbl.pbl_pbase;
- }
+ root_vpbl.pbl_pbase = vpbl.pbl_pbase;
if (single_page) {
pbl_count = 0;
} else {
- pbl_count = root_pbl_index;
+ pbl_count = 1;
}
ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
- buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+ addr, pbl_count, 1, acc, iova_start,
&nesmr->pbls_used, &nesmr->pbl_4k);
if (ret == 0) {
@@ -2242,21 +2108,9 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
ibmr = ERR_PTR(-ENOMEM);
}
- reg_phys_err:
- /* free the resources */
- if (root_pbl_index == 1) {
- /* single PBL case */
- pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
- } else {
- for (i=0; i<root_pbl_index; i++) {
- pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase,
- root_vpbl.leaf_vpbl[i].pbl_pbase);
- }
- kfree(root_vpbl.leaf_vpbl);
- pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
- root_vpbl.pbl_pbase);
- }
-
+reg_phys_err:
+ /* single PBL case */
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
return ibmr;
}
@@ -2266,17 +2120,13 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
*/
static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
{
- struct ib_phys_buf bl;
u64 kva = 0;
nes_debug(NES_DBG_MR, "\n");
- bl.size = (u64)0xffffffffffULL;
- bl.addr = 0;
- return nes_reg_phys_mr(pd, &bl, 1, acc, &kva);
+ return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
}
-
/**
* nes_reg_user_mr
*/
@@ -3888,12 +3738,10 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
nesibdev->ibdev.destroy_cq = nes_destroy_cq;
nesibdev->ibdev.poll_cq = nes_poll_cq;
nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
- nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr;
nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
nesibdev->ibdev.dereg_mr = nes_dereg_mr;
nesibdev->ibdev.alloc_mw = nes_alloc_mw;
nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
- nesibdev->ibdev.bind_mw = nes_bind_mw;
nesibdev->ibdev.alloc_mr = nes_alloc_mr;
nesibdev->ibdev.map_mr_sg = nes_map_mr_sg;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
index a204b67..7029088 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -190,4 +190,8 @@ struct nes_qp {
u8 pau_state;
__u64 nesuqp_addr;
};
+
+struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
+ u64 addr, u64 size, int acc, u64 *iova_start);
+
#endif /* NES_VERBS_H */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 9820074..3790771 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -152,9 +152,10 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
if ((pd->uctx) &&
(!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) &&
(!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) {
- status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
- attr->dmac, &vlan_tag,
- sgid_attr.ndev->ifindex);
+ status = rdma_addr_find_l2_eth_by_grh(&sgid, &attr->grh.dgid,
+ attr->dmac, &vlan_tag,
+ &sgid_attr.ndev->ifindex,
+ NULL);
if (status) {
pr_err("%s(): Failed to resolve dmac from gid."
"status = %d\n", __func__, status);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 3afb40b..5738493 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -175,7 +175,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
dev->ibdev.req_notify_cq = ocrdma_arm_cq;
dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
- dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr;
dev->ibdev.dereg_mr = ocrdma_dereg_mr;
dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 76e96f9..d4c687b 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -3066,169 +3066,6 @@ pl_err:
return ERR_PTR(-ENOMEM);
}
-#define MAX_KERNEL_PBE_SIZE 65536
-static inline int count_kernel_pbes(struct ib_phys_buf *buf_list,
- int buf_cnt, u32 *pbe_size)
-{
- u64 total_size = 0;
- u64 buf_size = 0;
- int i;
- *pbe_size = roundup(buf_list[0].size, PAGE_SIZE);
- *pbe_size = roundup_pow_of_two(*pbe_size);
-
- /* find the smallest PBE size that we can have */
- for (i = 0; i < buf_cnt; i++) {
- /* first addr may not be page aligned, so ignore checking */
- if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) ||
- (buf_list[i].size & ~PAGE_MASK))) {
- return 0;
- }
-
- /* if configured PBE size is greater then the chosen one,
- * reduce the PBE size.
- */
- buf_size = roundup(buf_list[i].size, PAGE_SIZE);
- /* pbe_size has to be even multiple of 4K 1,2,4,8...*/
- buf_size = roundup_pow_of_two(buf_size);
- if (*pbe_size > buf_size)
- *pbe_size = buf_size;
-
- total_size += buf_size;
- }
- *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ?
- (MAX_KERNEL_PBE_SIZE) : (*pbe_size);
-
- /* num_pbes = total_size / (*pbe_size); this is implemented below. */
-
- return total_size >> ilog2(*pbe_size);
-}
-
-static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt,
- u32 pbe_size, struct ocrdma_pbl *pbl_tbl,
- struct ocrdma_hw_mr *hwmr)
-{
- int i;
- int idx;
- int pbes_per_buf = 0;
- u64 buf_addr = 0;
- int num_pbes;
- struct ocrdma_pbe *pbe;
- int total_num_pbes = 0;
-
- if (!hwmr->num_pbes)
- return;
-
- pbe = (struct ocrdma_pbe *)pbl_tbl->va;
- num_pbes = 0;
-
- /* go through the OS phy regions & fill hw pbe entries into pbls. */
- for (i = 0; i < ib_buf_cnt; i++) {
- buf_addr = buf_list[i].addr;
- pbes_per_buf =
- roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) /
- pbe_size;
- hwmr->len += buf_list[i].size;
- /* number of pbes can be more for one OS buf, when
- * buffers are of different sizes.
- * split the ib_buf to one or more pbes.
- */
- for (idx = 0; idx < pbes_per_buf; idx++) {
- /* we program always page aligned addresses,
- * first unaligned address is taken care by fbo.
- */
- if (i == 0) {
- /* for non zero fbo, assign the
- * start of the page.
- */
- pbe->pa_lo =
- cpu_to_le32((u32) (buf_addr & PAGE_MASK));
- pbe->pa_hi =
- cpu_to_le32((u32) upper_32_bits(buf_addr));
- } else {
- pbe->pa_lo =
- cpu_to_le32((u32) (buf_addr & 0xffffffff));
- pbe->pa_hi =
- cpu_to_le32((u32) upper_32_bits(buf_addr));
- }
- buf_addr += pbe_size;
- num_pbes += 1;
- total_num_pbes += 1;
- pbe++;
-
- if (total_num_pbes == hwmr->num_pbes)
- goto mr_tbl_done;
- /* if the pbl is full storing the pbes,
- * move to next pbl.
- */
- if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
- pbl_tbl++;
- pbe = (struct ocrdma_pbe *)pbl_tbl->va;
- num_pbes = 0;
- }
- }
- }
-mr_tbl_done:
- return;
-}
-
-struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd,
- struct ib_phys_buf *buf_list,
- int buf_cnt, int acc, u64 *iova_start)
-{
- int status = -ENOMEM;
- struct ocrdma_mr *mr;
- struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
- struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
- u32 num_pbes;
- u32 pbe_size = 0;
-
- if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE))
- return ERR_PTR(-EINVAL);
-
- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
- if (!mr)
- return ERR_PTR(status);
-
- num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size);
- if (num_pbes == 0) {
- status = -EINVAL;
- goto pbl_err;
- }
- status = ocrdma_get_pbl_info(dev, mr, num_pbes);
- if (status)
- goto pbl_err;
-
- mr->hwmr.pbe_size = pbe_size;
- mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK);
- mr->hwmr.va = *iova_start;
- mr->hwmr.local_rd = 1;
- mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
- mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
- mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
- mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
- mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
-
- status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
- if (status)
- goto pbl_err;
- build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table,
- &mr->hwmr);
- status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
- if (status)
- goto mbx_err;
-
- mr->ibmr.lkey = mr->hwmr.lkey;
- if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
- mr->ibmr.rkey = mr->hwmr.lkey;
- return &mr->ibmr;
-
-mbx_err:
- ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
-pbl_err:
- kfree(mr);
- return ERR_PTR(status);
-}
-
static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr)
{
struct ocrdma_mr *mr = get_ocrdma_mr(ibmr);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index a2f3b4d..8b517fd 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -117,9 +117,6 @@ int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *,
int ocrdma_dereg_mr(struct ib_mr *);
struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
-struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start);
struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *);
struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
index 294f5c7..5f53304 100644
--- a/drivers/infiniband/hw/qib/qib_mr.c
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -150,10 +150,7 @@ static struct qib_mr *alloc_mr(int count, struct ib_pd *pd)
rval = init_qib_mregion(&mr->mr, pd, count);
if (rval)
goto bail;
- /*
- * ib_reg_phys_mr() will initialize mr->ibmr except for
- * lkey and rkey.
- */
+
rval = qib_alloc_lkey(&mr->mr, 0);
if (rval)
goto bail_mregion;
@@ -171,52 +168,6 @@ bail:
}
/**
- * qib_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start)
-{
- struct qib_mr *mr;
- int n, m, i;
- struct ib_mr *ret;
-
- mr = alloc_mr(num_phys_buf, pd);
- if (IS_ERR(mr)) {
- ret = (struct ib_mr *)mr;
- goto bail;
- }
-
- mr->mr.user_base = *iova_start;
- mr->mr.iova = *iova_start;
- mr->mr.access_flags = acc;
-
- m = 0;
- n = 0;
- for (i = 0; i < num_phys_buf; i++) {
- mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
- mr->mr.map[m]->segs[n].length = buffer_list[i].size;
- mr->mr.length += buffer_list[i].size;
- n++;
- if (n == QIB_SEGSZ) {
- m++;
- n = 0;
- }
- }
-
- ret = &mr->ibmr;
-
-bail:
- return ret;
-}
-
-/**
* qib_reg_user_mr - register a userspace memory region
* @pd: protection domain for this memory region
* @start: starting userspace address
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index 40f85bb..3eff35c 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -100,9 +100,10 @@ static u32 credit_table[31] = {
32768 /* 1E */
};
-static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
+static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map,
+ gfp_t gfp)
{
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ unsigned long page = get_zeroed_page(gfp);
/*
* Free the page if someone raced with us installing it.
@@ -121,7 +122,7 @@ static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
* zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
*/
static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
- enum ib_qp_type type, u8 port)
+ enum ib_qp_type type, u8 port, gfp_t gfp)
{
u32 i, offset, max_scan, qpn;
struct qpn_map *map;
@@ -151,7 +152,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
max_scan = qpt->nmaps - !offset;
for (i = 0;;) {
if (unlikely(!map->page)) {
- get_map_page(qpt, map);
+ get_map_page(qpt, map, gfp);
if (unlikely(!map->page))
break;
}
@@ -983,13 +984,21 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
size_t sz;
size_t sg_list_sz;
struct ib_qp *ret;
+ gfp_t gfp;
+
if (init_attr->cap.max_send_sge > ib_qib_max_sges ||
init_attr->cap.max_send_wr > ib_qib_max_qp_wrs ||
- init_attr->create_flags) {
- ret = ERR_PTR(-EINVAL);
- goto bail;
- }
+ init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
+ return ERR_PTR(-EINVAL);
+
+ /* GFP_NOIO is applicable in RC QPs only */
+ if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
+ init_attr->qp_type != IB_QPT_RC)
+ return ERR_PTR(-EINVAL);
+
+ gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
+ GFP_NOIO : GFP_KERNEL;
/* Check receive queue parameters if no SRQ is specified. */
if (!init_attr->srq) {
@@ -1021,7 +1030,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
sz = sizeof(struct qib_sge) *
init_attr->cap.max_send_sge +
sizeof(struct qib_swqe);
- swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ swq = __vmalloc((init_attr->cap.max_send_wr + 1) * sz,
+ gfp, PAGE_KERNEL);
if (swq == NULL) {
ret = ERR_PTR(-ENOMEM);
goto bail;
@@ -1037,13 +1047,13 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
} else if (init_attr->cap.max_recv_sge > 1)
sg_list_sz = sizeof(*qp->r_sg_list) *
(init_attr->cap.max_recv_sge - 1);
- qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+ qp = kzalloc(sz + sg_list_sz, gfp);
if (!qp) {
ret = ERR_PTR(-ENOMEM);
goto bail_swq;
}
RCU_INIT_POINTER(qp->next, NULL);
- qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+ qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), gfp);
if (!qp->s_hdr) {
ret = ERR_PTR(-ENOMEM);
goto bail_qp;
@@ -1058,8 +1068,16 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
sizeof(struct qib_rwqe);
- qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) +
- qp->r_rq.size * sz);
+ if (gfp != GFP_NOIO)
+ qp->r_rq.wq = vmalloc_user(
+ sizeof(struct qib_rwq) +
+ qp->r_rq.size * sz);
+ else
+ qp->r_rq.wq = __vmalloc(
+ sizeof(struct qib_rwq) +
+ qp->r_rq.size * sz,
+ gfp, PAGE_KERNEL);
+
if (!qp->r_rq.wq) {
ret = ERR_PTR(-ENOMEM);
goto bail_qp;
@@ -1090,7 +1108,7 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
dev = to_idev(ibpd->device);
dd = dd_from_dev(dev);
err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type,
- init_attr->port_num);
+ init_attr->port_num, gfp);
if (err < 0) {
ret = ERR_PTR(err);
vfree(qp->r_rq.wq);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index de6cb6f..baf1e42 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -346,6 +346,7 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
unsigned long flags;
struct qib_lkey_table *rkt;
struct qib_pd *pd;
+ int avoid_schedule = 0;
spin_lock_irqsave(&qp->s_lock, flags);
@@ -438,11 +439,15 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
qp->ibqp.qp_type == IB_QPT_RC) {
if (wqe->length > 0x80000000U)
goto bail_inval_free;
+ if (wqe->length <= qp->pmtu)
+ avoid_schedule = 1;
} else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport +
- qp->port_num - 1)->ibmtu)
+ qp->port_num - 1)->ibmtu) {
goto bail_inval_free;
- else
+ } else {
atomic_inc(&to_iah(ud_wr(wr)->ah)->refcount);
+ avoid_schedule = 1;
+ }
wqe->ssn = qp->s_ssn++;
qp->s_head = next;
@@ -458,7 +463,7 @@ bail_inval_free:
bail_inval:
ret = -EINVAL;
bail:
- if (!ret && !wr->next &&
+ if (!ret && !wr->next && !avoid_schedule &&
!qib_sdma_empty(
dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) {
qib_schedule_send(qp);
@@ -2256,7 +2261,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
ibdev->poll_cq = qib_poll_cq;
ibdev->req_notify_cq = qib_req_notify_cq;
ibdev->get_dma_mr = qib_get_dma_mr;
- ibdev->reg_phys_mr = qib_reg_phys_mr;
ibdev->reg_user_mr = qib_reg_user_mr;
ibdev->dereg_mr = qib_dereg_mr;
ibdev->alloc_mr = qib_alloc_mr;
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index bc803f3..6c5e777 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -1032,10 +1032,6 @@ int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc);
-struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start);
-
struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
index f8ea069..b2fb528 100644
--- a/drivers/infiniband/hw/qib/qib_verbs_mcast.c
+++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
@@ -286,15 +286,13 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
struct qib_ibdev *dev = to_idev(ibqp->device);
struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num);
struct qib_mcast *mcast = NULL;
- struct qib_mcast_qp *p, *tmp;
+ struct qib_mcast_qp *p, *tmp, *delp = NULL;
struct rb_node *n;
int last = 0;
int ret;
- if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
- ret = -EINVAL;
- goto bail;
- }
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET)
+ return -EINVAL;
spin_lock_irq(&ibp->lock);
@@ -303,8 +301,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
while (1) {
if (n == NULL) {
spin_unlock_irq(&ibp->lock);
- ret = -EINVAL;
- goto bail;
+ return -EINVAL;
}
mcast = rb_entry(n, struct qib_mcast, rb_node);
@@ -328,6 +325,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
*/
list_del_rcu(&p->list);
mcast->n_attached--;
+ delp = p;
/* If this was the last attached QP, remove the GID too. */
if (list_empty(&mcast->qp_list)) {
@@ -338,15 +336,16 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
}
spin_unlock_irq(&ibp->lock);
+ /* QP not attached */
+ if (!delp)
+ return -EINVAL;
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ qib_mcast_qp_free(delp);
- if (p) {
- /*
- * Wait for any list walkers to finish before freeing the
- * list element.
- */
- wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
- qib_mcast_qp_free(p);
- }
if (last) {
atomic_dec(&mcast->refcount);
wait_event(mcast->wait, !atomic_read(&mcast->refcount));
@@ -355,11 +354,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
dev->n_mcast_grps_allocated--;
spin_unlock_irq(&dev->n_mcast_grps_lock);
}
-
- ret = 0;
-
-bail:
- return ret;
+ return 0;
}
int qib_mcast_tree_empty(struct qib_ibport *ibp)
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c
index 5e55b8b..92dc66c 100644
--- a/drivers/infiniband/hw/usnic/usnic_debugfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c
@@ -157,8 +157,9 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
qp_flow,
&flowinfo_ops);
if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
- usnic_err("Failed to create dbg fs entry for flow %u\n",
- qp_flow->flow->flow_id);
+ usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n",
+ qp_flow->flow->flow_id,
+ PTR_ERR(qp_flow->dbgfs_dentry));
}
}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
index fcea3a2..5f44b66 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -521,7 +521,7 @@ int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
if (!status) {
qp_grp->state = new_state;
- usnic_info("Transistioned %u from %s to %s",
+ usnic_info("Transitioned %u from %s to %s",
qp_grp->grp_id,
usnic_ib_qp_grp_state_to_string(old_state),
usnic_ib_qp_grp_state_to_string(new_state));
@@ -575,7 +575,7 @@ alloc_res_chunk_list(struct usnic_vnic *vnic,
return res_chunk_list;
out_free_res:
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
usnic_vnic_put_resources(res_chunk_list[i]);
kfree(res_chunk_list);
return ERR_PTR(err);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index f8e3211..6cdb4d2 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -51,7 +51,7 @@
static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver)
{
- *fw_ver = (u64) *fw_ver_str;
+ *fw_ver = *((u64 *)fw_ver_str);
}
static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp,
@@ -571,20 +571,20 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
qp_grp = to_uqp_grp(ibqp);
- /* TODO: Future Support All States */
mutex_lock(&qp_grp->vf->pf->usdev_lock);
- if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) {
- status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL);
- } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) {
- status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL);
- } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) {
- status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL);
+ if ((attr_mask & IB_QP_PORT) && attr->port_num != 1) {
+ /* usnic devices only have one port */
+ status = -EINVAL;
+ goto out_unlock;
+ }
+ if (attr_mask & IB_QP_STATE) {
+ status = usnic_ib_qp_grp_modify(qp_grp, attr->qp_state, NULL);
} else {
- usnic_err("Unexpected combination mask: %u state: %u\n",
- attr_mask & IB_QP_STATE, attr->qp_state);
+ usnic_err("Unhandled request, attr_mask=0x%x\n", attr_mask);
status = -EINVAL;
}
+out_unlock:
mutex_unlock(&qp_grp->vf->pf->usdev_lock);
return status;
}
@@ -625,8 +625,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
virt_addr, length);
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
- if (IS_ERR_OR_NULL(mr))
- return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length,
access_flags, 0);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 414eaa5..0d9d2e6a 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -43,8 +43,6 @@ int usnic_ib_query_device(struct ib_device *ibdev,
struct ib_udata *uhw);
int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
struct ib_port_attr *props);
-enum rdma_protocol_type
-usnic_ib_query_protocol(struct ib_device *device, u8 port_num);
int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
int qp_attr_mask,
struct ib_qp_init_attr *qp_init_attr);
diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c
index 66de93f..8875107 100644
--- a/drivers/infiniband/hw/usnic/usnic_vnic.c
+++ b/drivers/infiniband/hw/usnic/usnic_vnic.c
@@ -237,7 +237,7 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
struct usnic_vnic_res *res;
int i;
- if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner)
+ if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 0 || !owner)
return ERR_PTR(-EINVAL);
ret = kzalloc(sizeof(*ret), GFP_ATOMIC);
@@ -247,26 +247,28 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
return ERR_PTR(-ENOMEM);
}
- ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC);
- if (!ret->res) {
- usnic_err("Failed to allocate resources for %s. Out of memory\n",
- usnic_vnic_pci_name(vnic));
- kfree(ret);
- return ERR_PTR(-ENOMEM);
- }
+ if (cnt > 0) {
+ ret->res = kcalloc(cnt, sizeof(*(ret->res)), GFP_ATOMIC);
+ if (!ret->res) {
+ usnic_err("Failed to allocate resources for %s. Out of memory\n",
+ usnic_vnic_pci_name(vnic));
+ kfree(ret);
+ return ERR_PTR(-ENOMEM);
+ }
- spin_lock(&vnic->res_lock);
- src = &vnic->chunks[type];
- for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
- res = src->res[i];
- if (!res->owner) {
- src->free_cnt--;
- res->owner = owner;
- ret->res[ret->cnt++] = res;
+ spin_lock(&vnic->res_lock);
+ src = &vnic->chunks[type];
+ for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
+ res = src->res[i];
+ if (!res->owner) {
+ src->free_cnt--;
+ res->owner = owner;
+ ret->res[ret->cnt++] = res;
+ }
}
- }
- spin_unlock(&vnic->res_lock);
+ spin_unlock(&vnic->res_lock);
+ }
ret->type = type;
ret->vnic = vnic;
WARN_ON(ret->cnt != cnt);
@@ -281,14 +283,16 @@ void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk)
int i;
struct usnic_vnic *vnic = chunk->vnic;
- spin_lock(&vnic->res_lock);
- while ((i = --chunk->cnt) >= 0) {
- res = chunk->res[i];
- chunk->res[i] = NULL;
- res->owner = NULL;
- vnic->chunks[res->type].free_cnt++;
+ if (chunk->cnt > 0) {
+ spin_lock(&vnic->res_lock);
+ while ((i = --chunk->cnt) >= 0) {
+ res = chunk->res[i];
+ chunk->res[i] = NULL;
+ res->owner = NULL;
+ vnic->chunks[res->type].free_cnt++;
+ }
+ spin_unlock(&vnic->res_lock);
}
- spin_unlock(&vnic->res_lock);
kfree(chunk->res);
kfree(chunk);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 3ede103..a6f3eab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -495,7 +495,6 @@ void ipoib_dev_cleanup(struct net_device *dev);
void ipoib_mcast_join_task(struct work_struct *work);
void ipoib_mcast_carrier_on_task(struct work_struct *work);
void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
-void ipoib_mcast_free(struct ipoib_mcast *mc);
void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
@@ -549,8 +548,9 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
union ib_gid *mgid, int set_qkey);
-int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast);
-struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid);
+void ipoib_mcast_remove_list(struct list_head *remove_list);
+void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid,
+ struct list_head *remove_list);
int ipoib_init_qp(struct net_device *dev);
int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 3ae9726..917e46e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
static struct ib_send_wr ipoib_cm_rx_drain_wr = {
- .wr_id = IPOIB_CM_RX_DRAIN_WRID,
.opcode = IB_WR_SEND,
};
@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
* error" WC will be immediately generated for each WR we post.
*/
p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+ ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
ipoib_warn(priv, "failed to post drain wr\n");
@@ -1522,8 +1522,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
int ipoib_cm_dev_init(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- int i, ret;
- struct ib_device_attr attr;
+ int max_srq_sge, i;
INIT_LIST_HEAD(&priv->cm.passive_ids);
INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1540,19 +1539,13 @@ int ipoib_cm_dev_init(struct net_device *dev)
skb_queue_head_init(&priv->cm.skb_queue);
- ret = ib_query_device(priv->ca, &attr);
- if (ret) {
- printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
- return ret;
- }
-
- ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+ ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);
- attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
- ipoib_cm_create_srq(dev, attr.max_srq_sge);
+ max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
+ ipoib_cm_create_srq(dev, max_srq_sge);
if (ipoib_cm_has_srq(dev)) {
- priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
- priv->cm.num_frags = attr.max_srq_sge;
+ priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
+ priv->cm.num_frags = max_srq_sge;
ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
priv->cm.max_cm_mtu, priv->cm.num_frags);
} else {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 078cadd..a53fa5f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -40,15 +40,11 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *drvinfo)
{
struct ipoib_dev_priv *priv = netdev_priv(netdev);
- struct ib_device_attr *attr;
-
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
- if (attr && !ib_query_device(priv->ca, attr))
- snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
- "%d.%d.%d", (int)(attr->fw_ver >> 32),
- (int)(attr->fw_ver >> 16) & 0xffff,
- (int)attr->fw_ver & 0xffff);
- kfree(attr);
+
+ snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+ "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32),
+ (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff,
+ (int)priv->ca->attrs.fw_ver & 0xffff);
strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
sizeof(drvinfo->bus_info));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 7d32818..25509bb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1150,8 +1150,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
unsigned long flags;
int i;
LIST_HEAD(remove_list);
- struct ipoib_mcast *mcast, *tmcast;
- struct net_device *dev = priv->dev;
if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
return;
@@ -1179,18 +1177,8 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
lockdep_is_held(&priv->lock))) != NULL) {
/* was the neigh idle for two GC periods */
if (time_after(neigh_obsolete, neigh->alive)) {
- u8 *mgid = neigh->daddr + 4;
- /* Is this multicast ? */
- if (*mgid == 0xff) {
- mcast = __ipoib_mcast_find(dev, mgid);
-
- if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
- list_del(&mcast->list);
- rb_erase(&mcast->rb_node, &priv->multicast_tree);
- list_add_tail(&mcast->list, &remove_list);
- }
- }
+ ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
rcu_assign_pointer(*np,
rcu_dereference_protected(neigh->hnext,
@@ -1207,10 +1195,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
out_unlock:
spin_unlock_irqrestore(&priv->lock, flags);
- list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
- ipoib_mcast_leave(dev, mcast);
- ipoib_mcast_free(mcast);
- }
+ ipoib_mcast_remove_list(&remove_list);
}
static void ipoib_reap_neigh(struct work_struct *work)
@@ -1777,26 +1762,7 @@ int ipoib_add_pkey_attr(struct net_device *dev)
int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
- struct ib_device_attr *device_attr;
- int result = -ENOMEM;
-
- device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
- if (!device_attr) {
- printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
- hca->name, sizeof *device_attr);
- return result;
- }
-
- result = ib_query_device(hca, device_attr);
- if (result) {
- printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
- hca->name, result);
- kfree(device_attr);
- return result;
- }
- priv->hca_caps = device_attr->device_cap_flags;
-
- kfree(device_attr);
+ priv->hca_caps = hca->attrs.device_cap_flags;
if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
priv->dev->hw_features = NETIF_F_SG |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index f357ca6..050dfa1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -106,7 +106,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
queue_delayed_work(priv->wq, &priv->mcast_task, 0);
}
-void ipoib_mcast_free(struct ipoib_mcast *mcast)
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
{
struct net_device *dev = mcast->dev;
int tx_dropped = 0;
@@ -153,7 +153,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
return mcast;
}
-struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct rb_node *n = priv->multicast_tree.rb_node;
@@ -677,7 +677,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev)
return 0;
}
-int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret = 0;
@@ -704,6 +704,35 @@ int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
return 0;
}
+/*
+ * Check if the multicast group is sendonly. If so remove it from the maps
+ * and add to the remove list
+ */
+void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid,
+ struct list_head *remove_list)
+{
+ /* Is this multicast ? */
+ if (*mgid == 0xff) {
+ struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid);
+
+ if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ list_del(&mcast->list);
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&mcast->list, remove_list);
+ }
+ }
+}
+
+void ipoib_mcast_remove_list(struct list_head *remove_list)
+{
+ struct ipoib_mcast *mcast, *tmcast;
+
+ list_for_each_entry_safe(mcast, tmcast, remove_list, list) {
+ ipoib_mcast_leave(mcast->dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
+}
+
void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -810,10 +839,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
- list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
- ipoib_mcast_leave(dev, mcast);
- ipoib_mcast_free(mcast);
- }
+ ipoib_mcast_remove_list(&remove_list);
}
static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast)
@@ -939,10 +965,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
- list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
- ipoib_mcast_leave(mcast->dev, mcast);
- ipoib_mcast_free(mcast);
- }
+ ipoib_mcast_remove_list(&remove_list);
/*
* Double check that we are still up
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 9080161..c827c93 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -644,7 +644,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
ib_conn = &iser_conn->ib_conn;
if (ib_conn->pi_support) {
- u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
+ u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap;
scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
@@ -656,7 +656,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
* max fastreg page list length.
*/
shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
- ib_conn->device->dev_attr.max_fast_reg_page_list_len);
+ ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len);
shost->max_sectors = min_t(unsigned int,
1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);
@@ -1059,7 +1059,8 @@ static int __init iser_init(void)
release_wq = alloc_workqueue("release workqueue", 0, 0);
if (!release_wq) {
iser_err("failed to allocate release workqueue\n");
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_alloc_wq;
}
iscsi_iser_scsi_transport = iscsi_register_transport(
@@ -1067,12 +1068,14 @@ static int __init iser_init(void)
if (!iscsi_iser_scsi_transport) {
iser_err("iscsi_register_transport failed\n");
err = -EINVAL;
- goto register_transport_failure;
+ goto err_reg;
}
return 0;
-register_transport_failure:
+err_reg:
+ destroy_workqueue(release_wq);
+err_alloc_wq:
kmem_cache_destroy(ig.desc_cache);
return err;
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 8a5998e..95f0a64 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -48,6 +48,7 @@
#include <scsi/scsi_transport_iscsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>
+#include <scsi/iser.h>
#include <linux/interrupt.h>
#include <linux/wait.h>
@@ -151,46 +152,10 @@
- ISER_MAX_RX_MISC_PDUS) / \
(1 + ISER_INFLIGHT_DATAOUTS))
-#define ISER_WC_BATCH_COUNT 16
#define ISER_SIGNAL_CMD_COUNT 32
-#define ISER_VER 0x10
-#define ISER_WSV 0x08
-#define ISER_RSV 0x04
-
-#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
-#define ISER_BEACON_WRID 0xfffffffffffffffeULL
-
-/**
- * struct iser_hdr - iSER header
- *
- * @flags: flags support (zbva, remote_inv)
- * @rsvd: reserved
- * @write_stag: write rkey
- * @write_va: write virtual address
- * @reaf_stag: read rkey
- * @read_va: read virtual address
- */
-struct iser_hdr {
- u8 flags;
- u8 rsvd[3];
- __be32 write_stag;
- __be64 write_va;
- __be32 read_stag;
- __be64 read_va;
-} __attribute__((packed));
-
-
-#define ISER_ZBVA_NOT_SUPPORTED 0x80
-#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40
-
-struct iser_cm_hdr {
- u8 flags;
- u8 rsvd[3];
-} __packed;
-
/* Constant PDU lengths calculations */
-#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr))
#define ISER_RECV_DATA_SEG_LEN 128
#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
@@ -269,7 +234,7 @@ enum iser_desc_type {
#define ISER_MAX_WRS 7
/**
- * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ * struct iser_tx_desc - iSER TX descriptor
*
* @iser_header: iser header
* @iscsi_header: iscsi header
@@ -287,12 +252,13 @@ enum iser_desc_type {
* @sig_attrs: Signature attributes
*/
struct iser_tx_desc {
- struct iser_hdr iser_header;
+ struct iser_ctrl iser_header;
struct iscsi_hdr iscsi_header;
enum iser_desc_type type;
u64 dma_addr;
struct ib_sge tx_sg[2];
int num_sge;
+ struct ib_cqe cqe;
bool mapped;
u8 wr_idx;
union iser_wr {
@@ -306,9 +272,10 @@ struct iser_tx_desc {
};
#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
- sizeof(u64) + sizeof(struct ib_sge)))
+ sizeof(u64) + sizeof(struct ib_sge) + \
+ sizeof(struct ib_cqe)))
/**
- * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ * struct iser_rx_desc - iSER RX descriptor
*
* @iser_header: iser header
* @iscsi_header: iscsi header
@@ -318,12 +285,32 @@ struct iser_tx_desc {
* @pad: for sense data TODO: Modify to maximum sense length supported
*/
struct iser_rx_desc {
- struct iser_hdr iser_header;
+ struct iser_ctrl iser_header;
struct iscsi_hdr iscsi_header;
char data[ISER_RECV_DATA_SEG_LEN];
u64 dma_addr;
struct ib_sge rx_sg;
+ struct ib_cqe cqe;
char pad[ISER_RX_PAD_SIZE];
+} __packed;
+
+/**
+ * struct iser_login_desc - iSER login descriptor
+ *
+ * @req: pointer to login request buffer
+ * @resp: pointer to login response buffer
+ * @req_dma: DMA address of login request buffer
+ * @rsp_dma: DMA address of login response buffer
+ * @sge: IB sge for login post recv
+ * @cqe: completion handler
+ */
+struct iser_login_desc {
+ void *req;
+ void *rsp;
+ u64 req_dma;
+ u64 rsp_dma;
+ struct ib_sge sge;
+ struct ib_cqe cqe;
} __attribute__((packed));
struct iser_conn;
@@ -333,18 +320,12 @@ struct iscsi_iser_task;
/**
* struct iser_comp - iSER completion context
*
- * @device: pointer to device handle
* @cq: completion queue
- * @wcs: work completion array
- * @tasklet: Tasklet handle
* @active_qps: Number of active QPs attached
* to completion context
*/
struct iser_comp {
- struct iser_device *device;
struct ib_cq *cq;
- struct ib_wc wcs[ISER_WC_BATCH_COUNT];
- struct tasklet_struct tasklet;
int active_qps;
};
@@ -380,7 +361,6 @@ struct iser_reg_ops {
*
* @ib_device: RDMA device
* @pd: Protection Domain for this device
- * @dev_attr: Device attributes container
* @mr: Global DMA memory region
* @event_handler: IB events handle routine
* @ig_list: entry in devices list
@@ -389,18 +369,19 @@ struct iser_reg_ops {
* cpus and device max completion vectors
* @comps: Dinamically allocated array of completion handlers
* @reg_ops: Registration ops
+ * @remote_inv_sup: Remote invalidate is supported on this device
*/
struct iser_device {
struct ib_device *ib_device;
struct ib_pd *pd;
- struct ib_device_attr dev_attr;
struct ib_mr *mr;
struct ib_event_handler event_handler;
struct list_head ig_list;
int refcount;
int comps_used;
struct iser_comp *comps;
- struct iser_reg_ops *reg_ops;
+ const struct iser_reg_ops *reg_ops;
+ bool remote_inv_sup;
};
#define ISER_CHECK_GUARD 0xc0
@@ -475,10 +456,11 @@ struct iser_fr_pool {
* @rx_wr: receive work request for batch posts
* @device: reference to iser device
* @comp: iser completion context
- * @pi_support: Indicate device T10-PI support
- * @beacon: beacon send wr to signal all flush errors were drained
- * @flush_comp: completes when all connection completions consumed
* @fr_pool: connection fast registration poool
+ * @pi_support: Indicate device T10-PI support
+ * @last: last send wr to signal all flush errors were drained
+ * @last_cqe: cqe handler for last wr
+ * @last_comp: completes when all connection completions consumed
*/
struct ib_conn {
struct rdma_cm_id *cma_id;
@@ -488,10 +470,12 @@ struct ib_conn {
struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
struct iser_device *device;
struct iser_comp *comp;
- bool pi_support;
- struct ib_send_wr beacon;
- struct completion flush_comp;
struct iser_fr_pool fr_pool;
+ bool pi_support;
+ struct ib_send_wr last;
+ struct ib_cqe last_cqe;
+ struct ib_cqe reg_cqe;
+ struct completion last_comp;
};
/**
@@ -514,11 +498,7 @@ struct ib_conn {
* @up_completion: connection establishment completed
* (state is ISER_CONN_UP)
* @conn_list: entry in ig conn list
- * @login_buf: login data buffer (stores login parameters)
- * @login_req_buf: login request buffer
- * @login_req_dma: login request buffer dma address
- * @login_resp_buf: login response buffer
- * @login_resp_dma: login response buffer dma address
+ * @login_desc: login descriptor
* @rx_desc_head: head of rx_descs cyclic buffer
* @rx_descs: rx buffers array (cyclic buffer)
* @num_rx_descs: number of rx descriptors
@@ -541,15 +521,13 @@ struct iser_conn {
struct completion ib_completion;
struct completion up_completion;
struct list_head conn_list;
-
- char *login_buf;
- char *login_req_buf, *login_resp_buf;
- u64 login_req_dma, login_resp_dma;
+ struct iser_login_desc login_desc;
unsigned int rx_desc_head;
struct iser_rx_desc *rx_descs;
u32 num_rx_descs;
unsigned short scsi_sg_tablesize;
unsigned int scsi_max_sectors;
+ bool snd_w_inv;
};
/**
@@ -579,9 +557,8 @@ struct iscsi_iser_task {
struct iser_page_vec {
u64 *pages;
- int length;
- int offset;
- int data_size;
+ int npages;
+ struct ib_mr fake_mr;
};
/**
@@ -633,12 +610,14 @@ int iser_conn_terminate(struct iser_conn *iser_conn);
void iser_release_work(struct work_struct *work);
-void iser_rcv_completion(struct iser_rx_desc *desc,
- unsigned long dto_xfer_len,
- struct ib_conn *ib_conn);
-
-void iser_snd_completion(struct iser_tx_desc *desc,
- struct ib_conn *ib_conn);
+void iser_err_comp(struct ib_wc *wc, const char *type);
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc);
void iser_task_rdma_init(struct iscsi_iser_task *task);
@@ -651,7 +630,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir);
int iser_reg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir);
+ enum iser_data_dir dir,
+ bool all_imm);
void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
enum iser_data_dir dir);
@@ -719,4 +699,28 @@ iser_tx_next_wr(struct iser_tx_desc *tx_desc)
return cur_wr;
}
+static inline struct iser_conn *
+to_iser_conn(struct ib_conn *ib_conn)
+{
+ return container_of(ib_conn, struct iser_conn, ib_conn);
+}
+
+static inline struct iser_rx_desc *
+iser_rx(struct ib_cqe *cqe)
+{
+ return container_of(cqe, struct iser_rx_desc, cqe);
+}
+
+static inline struct iser_tx_desc *
+iser_tx(struct ib_cqe *cqe)
+{
+ return container_of(cqe, struct iser_tx_desc, cqe);
+}
+
+static inline struct iser_login_desc *
+iser_login(struct ib_cqe *cqe)
+{
+ return container_of(cqe, struct iser_login_desc, cqe);
+}
+
#endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index ffd00c4..ed54b38 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -51,7 +51,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_mem_reg *mem_reg;
int err;
- struct iser_hdr *hdr = &iser_task->desc.iser_header;
+ struct iser_ctrl *hdr = &iser_task->desc.iser_header;
struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
err = iser_dma_map_task_data(iser_task,
@@ -72,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
return err;
}
- err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+ err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false);
if (err) {
iser_err("Failed to set up Data-IN RDMA\n");
return err;
@@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_mem_reg *mem_reg;
int err;
- struct iser_hdr *hdr = &iser_task->desc.iser_header;
+ struct iser_ctrl *hdr = &iser_task->desc.iser_header;
struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
@@ -126,7 +126,8 @@ iser_prepare_write_cmd(struct iscsi_task *task,
return err;
}
- err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+ err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT,
+ buf_out->data_len == imm_sz);
if (err != 0) {
iser_err("Failed to register write cmd RDMA mem\n");
return err;
@@ -166,7 +167,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn,
ib_dma_sync_single_for_cpu(device->ib_device,
tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
- memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+ memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl));
tx_desc->iser_header.flags = ISER_VER;
tx_desc->num_sge = 1;
}
@@ -174,73 +175,63 @@ static void iser_create_send_desc(struct iser_conn *iser_conn,
static void iser_free_login_buf(struct iser_conn *iser_conn)
{
struct iser_device *device = iser_conn->ib_conn.device;
+ struct iser_login_desc *desc = &iser_conn->login_desc;
- if (!iser_conn->login_buf)
+ if (!desc->req)
return;
- if (iser_conn->login_req_dma)
- ib_dma_unmap_single(device->ib_device,
- iser_conn->login_req_dma,
- ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+ ib_dma_unmap_single(device->ib_device, desc->req_dma,
+ ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
- if (iser_conn->login_resp_dma)
- ib_dma_unmap_single(device->ib_device,
- iser_conn->login_resp_dma,
- ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+ ib_dma_unmap_single(device->ib_device, desc->rsp_dma,
+ ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
- kfree(iser_conn->login_buf);
+ kfree(desc->req);
+ kfree(desc->rsp);
/* make sure we never redo any unmapping */
- iser_conn->login_req_dma = 0;
- iser_conn->login_resp_dma = 0;
- iser_conn->login_buf = NULL;
+ desc->req = NULL;
+ desc->rsp = NULL;
}
static int iser_alloc_login_buf(struct iser_conn *iser_conn)
{
struct iser_device *device = iser_conn->ib_conn.device;
- int req_err, resp_err;
-
- BUG_ON(device == NULL);
-
- iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
- ISER_RX_LOGIN_SIZE, GFP_KERNEL);
- if (!iser_conn->login_buf)
- goto out_err;
-
- iser_conn->login_req_buf = iser_conn->login_buf;
- iser_conn->login_resp_buf = iser_conn->login_buf +
- ISCSI_DEF_MAX_RECV_SEG_LEN;
-
- iser_conn->login_req_dma = ib_dma_map_single(device->ib_device,
- iser_conn->login_req_buf,
- ISCSI_DEF_MAX_RECV_SEG_LEN,
- DMA_TO_DEVICE);
-
- iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device,
- iser_conn->login_resp_buf,
- ISER_RX_LOGIN_SIZE,
- DMA_FROM_DEVICE);
-
- req_err = ib_dma_mapping_error(device->ib_device,
- iser_conn->login_req_dma);
- resp_err = ib_dma_mapping_error(device->ib_device,
- iser_conn->login_resp_dma);
-
- if (req_err || resp_err) {
- if (req_err)
- iser_conn->login_req_dma = 0;
- if (resp_err)
- iser_conn->login_resp_dma = 0;
- goto free_login_buf;
- }
+ struct iser_login_desc *desc = &iser_conn->login_desc;
+
+ desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL);
+ if (!desc->req)
+ return -ENOMEM;
+
+ desc->req_dma = ib_dma_map_single(device->ib_device, desc->req,
+ ISCSI_DEF_MAX_RECV_SEG_LEN,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device->ib_device,
+ desc->req_dma))
+ goto free_req;
+
+ desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+ if (!desc->rsp)
+ goto unmap_req;
+
+ desc->rsp_dma = ib_dma_map_single(device->ib_device, desc->rsp,
+ ISER_RX_LOGIN_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(device->ib_device,
+ desc->rsp_dma))
+ goto free_rsp;
+
return 0;
-free_login_buf:
- iser_free_login_buf(iser_conn);
+free_rsp:
+ kfree(desc->rsp);
+unmap_req:
+ ib_dma_unmap_single(device->ib_device, desc->req_dma,
+ ISCSI_DEF_MAX_RECV_SEG_LEN,
+ DMA_TO_DEVICE);
+free_req:
+ kfree(desc->req);
-out_err:
- iser_err("unable to alloc or map login buf\n");
return -ENOMEM;
}
@@ -280,11 +271,11 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
goto rx_desc_dma_map_failed;
rx_desc->dma_addr = dma_addr;
-
+ rx_desc->cqe.done = iser_task_rsp;
rx_sg = &rx_desc->rx_sg;
- rx_sg->addr = rx_desc->dma_addr;
+ rx_sg->addr = rx_desc->dma_addr;
rx_sg->length = ISER_RX_PAYLOAD_SIZE;
- rx_sg->lkey = device->pd->local_dma_lkey;
+ rx_sg->lkey = device->pd->local_dma_lkey;
}
iser_conn->rx_desc_head = 0;
@@ -383,6 +374,7 @@ int iser_send_command(struct iscsi_conn *conn,
/* build the tx desc regd header and add it to the tx desc dto */
tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+ tx_desc->cqe.done = iser_cmd_comp;
iser_create_send_desc(iser_conn, tx_desc);
if (hdr->flags & ISCSI_FLAG_CMD_READ) {
@@ -464,6 +456,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
}
tx_desc->type = ISCSI_TX_DATAOUT;
+ tx_desc->cqe.done = iser_dataout_comp;
tx_desc->iser_header.flags = ISER_VER;
memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
@@ -513,6 +506,7 @@ int iser_send_control(struct iscsi_conn *conn,
/* build the tx desc regd header and add it to the tx desc dto */
mdesc->type = ISCSI_TX_CONTROL;
+ mdesc->cqe.done = iser_ctrl_comp;
iser_create_send_desc(iser_conn, mdesc);
device = iser_conn->ib_conn.device;
@@ -520,25 +514,25 @@ int iser_send_control(struct iscsi_conn *conn,
data_seg_len = ntoh24(task->hdr->dlength);
if (data_seg_len > 0) {
+ struct iser_login_desc *desc = &iser_conn->login_desc;
struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+
if (task != conn->login_task) {
iser_err("data present on non login task!!!\n");
goto send_control_error;
}
- ib_dma_sync_single_for_cpu(device->ib_device,
- iser_conn->login_req_dma, task->data_count,
- DMA_TO_DEVICE);
+ ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma,
+ task->data_count, DMA_TO_DEVICE);
- memcpy(iser_conn->login_req_buf, task->data, task->data_count);
+ memcpy(desc->req, task->data, task->data_count);
- ib_dma_sync_single_for_device(device->ib_device,
- iser_conn->login_req_dma, task->data_count,
- DMA_TO_DEVICE);
+ ib_dma_sync_single_for_device(device->ib_device, desc->req_dma,
+ task->data_count, DMA_TO_DEVICE);
- tx_dsg->addr = iser_conn->login_req_dma;
- tx_dsg->length = task->data_count;
- tx_dsg->lkey = device->pd->local_dma_lkey;
+ tx_dsg->addr = desc->req_dma;
+ tx_dsg->length = task->data_count;
+ tx_dsg->lkey = device->pd->local_dma_lkey;
mdesc->num_sge = 2;
}
@@ -562,41 +556,126 @@ send_control_error:
return err;
}
-/**
- * iser_rcv_dto_completion - recv DTO completion
- */
-void iser_rcv_completion(struct iser_rx_desc *rx_desc,
- unsigned long rx_xfer_len,
- struct ib_conn *ib_conn)
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
{
- struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
- ib_conn);
+ struct ib_conn *ib_conn = wc->qp->qp_context;
+ struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+ struct iser_login_desc *desc = iser_login(wc->wr_cqe);
struct iscsi_hdr *hdr;
- u64 rx_dma;
- int rx_buflen, outstanding, count, err;
+ char *data;
+ int length;
- /* differentiate between login to all other PDUs */
- if ((char *)rx_desc == iser_conn->login_resp_buf) {
- rx_dma = iser_conn->login_resp_dma;
- rx_buflen = ISER_RX_LOGIN_SIZE;
- } else {
- rx_dma = rx_desc->dma_addr;
- rx_buflen = ISER_RX_PAYLOAD_SIZE;
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ iser_err_comp(wc, "login_rsp");
+ return;
+ }
+
+ ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+ desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+ DMA_FROM_DEVICE);
+
+ hdr = desc->rsp + sizeof(struct iser_ctrl);
+ data = desc->rsp + ISER_HEADERS_LEN;
+ length = wc->byte_len - ISER_HEADERS_LEN;
+
+ iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+ hdr->itt, length);
+
+ iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length);
+
+ ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+ desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+ DMA_FROM_DEVICE);
+
+ ib_conn->post_recv_buf_count--;
+}
+
+static inline void
+iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
+{
+ if (likely(rkey == desc->rsc.mr->rkey))
+ desc->rsc.mr_valid = 0;
+ else if (likely(rkey == desc->pi_ctx->sig_mr->rkey))
+ desc->pi_ctx->sig_mr_valid = 0;
+}
+
+static int
+iser_check_remote_inv(struct iser_conn *iser_conn,
+ struct ib_wc *wc,
+ struct iscsi_hdr *hdr)
+{
+ if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
+ struct iscsi_task *task;
+ u32 rkey = wc->ex.invalidate_rkey;
+
+ iser_dbg("conn %p: remote invalidation for rkey %#x\n",
+ iser_conn, rkey);
+
+ if (unlikely(!iser_conn->snd_w_inv)) {
+ iser_err("conn %p: unexepected remote invalidation, "
+ "terminating connection\n", iser_conn);
+ return -EPROTO;
+ }
+
+ task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt);
+ if (likely(task)) {
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_fr_desc *desc;
+
+ if (iser_task->dir[ISER_DIR_IN]) {
+ desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h;
+ iser_inv_desc(desc, rkey);
+ }
+
+ if (iser_task->dir[ISER_DIR_OUT]) {
+ desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h;
+ iser_inv_desc(desc, rkey);
+ }
+ } else {
+ iser_err("failed to get task for itt=%d\n", hdr->itt);
+ return -EINVAL;
+ }
}
- ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
- rx_buflen, DMA_FROM_DEVICE);
+ return 0;
+}
- hdr = &rx_desc->iscsi_header;
+
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_conn *ib_conn = wc->qp->qp_context;
+ struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+ struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
+ struct iscsi_hdr *hdr;
+ int length;
+ int outstanding, count, err;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ iser_err_comp(wc, "task_rsp");
+ return;
+ }
+
+ ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+ desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+ DMA_FROM_DEVICE);
+
+ hdr = &desc->iscsi_header;
+ length = wc->byte_len - ISER_HEADERS_LEN;
iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
- hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
+ hdr->itt, length);
+
+ if (iser_check_remote_inv(iser_conn, wc, hdr)) {
+ iscsi_conn_failure(iser_conn->iscsi_conn,
+ ISCSI_ERR_CONN_FAILED);
+ return;
+ }
- iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data,
- rx_xfer_len - ISER_HEADERS_LEN);
+ iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length);
- ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
- rx_buflen, DMA_FROM_DEVICE);
+ ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+ desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+ DMA_FROM_DEVICE);
/* decrementing conn->post_recv_buf_count only --after-- freeing the *
* task eliminates the need to worry on tasks which are completed in *
@@ -604,9 +683,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
* for the posted rx bufs refcount to become zero handles everything */
ib_conn->post_recv_buf_count--;
- if (rx_dma == iser_conn->login_resp_dma)
- return;
-
outstanding = ib_conn->post_recv_buf_count;
if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
count = min(iser_conn->qp_max_recv_dtos - outstanding,
@@ -617,26 +693,47 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
}
}
-void iser_snd_completion(struct iser_tx_desc *tx_desc,
- struct ib_conn *ib_conn)
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)
{
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ iser_err_comp(wc, "command");
+}
+
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
struct iscsi_task *task;
- struct iser_device *device = ib_conn->device;
- if (tx_desc->type == ISCSI_TX_DATAOUT) {
- ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
- ISER_HEADERS_LEN, DMA_TO_DEVICE);
- kmem_cache_free(ig.desc_cache, tx_desc);
- tx_desc = NULL;
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ iser_err_comp(wc, "control");
+ return;
}
- if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {
- /* this arithmetic is legal by libiscsi dd_data allocation */
- task = (void *) ((long)(void *)tx_desc -
- sizeof(struct iscsi_task));
- if (task->hdr->itt == RESERVED_ITT)
- iscsi_put_task(task);
- }
+ /* this arithmetic is legal by libiscsi dd_data allocation */
+ task = (void *)desc - sizeof(struct iscsi_task);
+ if (task->hdr->itt == RESERVED_ITT)
+ iscsi_put_task(task);
+}
+
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
+ struct ib_conn *ib_conn = wc->qp->qp_context;
+ struct iser_device *device = ib_conn->device;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ iser_err_comp(wc, "dataout");
+
+ ib_dma_unmap_single(device->ib_device, desc->dma_addr,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ kmem_cache_free(ig.desc_cache, desc);
+}
+
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_conn *ib_conn = wc->qp->qp_context;
+
+ complete(&ib_conn->last_comp);
}
void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index ea765fb..9a391cc 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -49,7 +49,7 @@ int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
struct iser_reg_resources *rsc,
struct iser_mem_reg *mem_reg);
-static struct iser_reg_ops fastreg_ops = {
+static const struct iser_reg_ops fastreg_ops = {
.alloc_reg_res = iser_alloc_fastreg_pool,
.free_reg_res = iser_free_fastreg_pool,
.reg_mem = iser_fast_reg_mr,
@@ -58,7 +58,7 @@ static struct iser_reg_ops fastreg_ops = {
.reg_desc_put = iser_reg_desc_put_fr,
};
-static struct iser_reg_ops fmr_ops = {
+static const struct iser_reg_ops fmr_ops = {
.alloc_reg_res = iser_alloc_fmr_pool,
.free_reg_res = iser_free_fmr_pool,
.reg_mem = iser_fast_reg_fmr,
@@ -67,19 +67,24 @@ static struct iser_reg_ops fmr_ops = {
.reg_desc_put = iser_reg_desc_put_fmr,
};
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+ iser_err_comp(wc, "memreg");
+}
+
int iser_assign_reg_ops(struct iser_device *device)
{
- struct ib_device_attr *dev_attr = &device->dev_attr;
+ struct ib_device *ib_dev = device->ib_device;
/* Assign function handles - based on FMR support */
- if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
- device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+ if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr &&
+ ib_dev->map_phys_fmr && ib_dev->unmap_fmr) {
iser_info("FMR supported, using FMR for registration\n");
device->reg_ops = &fmr_ops;
- } else
- if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
iser_info("FastReg supported, using FastReg for registration\n");
device->reg_ops = &fastreg_ops;
+ device->remote_inv_sup = iser_always_reg;
} else {
iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
return -1;
@@ -131,67 +136,6 @@ iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
{
}
-#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0)
-
-/**
- * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
- * and returns the length of resulting physical address array (may be less than
- * the original due to possible compaction).
- *
- * we build a "page vec" under the assumption that the SG meets the RDMA
- * alignment requirements. Other then the first and last SG elements, all
- * the "internal" elements can be compacted into a list whose elements are
- * dma addresses of physical pages. The code supports also the weird case
- * where --few fragments of the same page-- are present in the SG as
- * consecutive elements. Also, it handles one entry SG.
- */
-
-static int iser_sg_to_page_vec(struct iser_data_buf *data,
- struct ib_device *ibdev, u64 *pages,
- int *offset, int *data_size)
-{
- struct scatterlist *sg, *sgl = data->sg;
- u64 start_addr, end_addr, page, chunk_start = 0;
- unsigned long total_sz = 0;
- unsigned int dma_len;
- int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
-
- /* compute the offset of first element */
- *offset = (u64) sgl[0].offset & ~MASK_4K;
-
- new_chunk = 1;
- cur_page = 0;
- for_each_sg(sgl, sg, data->dma_nents, i) {
- start_addr = ib_sg_dma_address(ibdev, sg);
- if (new_chunk)
- chunk_start = start_addr;
- dma_len = ib_sg_dma_len(ibdev, sg);
- end_addr = start_addr + dma_len;
- total_sz += dma_len;
-
- /* collect page fragments until aligned or end of SG list */
- if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
- new_chunk = 0;
- continue;
- }
- new_chunk = 1;
-
- /* address of the first page in the contiguous chunk;
- masking relevant for the very first SG entry,
- which might be unaligned */
- page = chunk_start & MASK_4K;
- do {
- pages[cur_page++] = page;
- page += SIZE_4K;
- } while (page < end_addr);
- }
-
- *data_size = total_sz;
- iser_dbg("page_vec->data_size:%d cur_page %d\n",
- *data_size, cur_page);
- return cur_page;
-}
-
static void iser_data_buf_dump(struct iser_data_buf *data,
struct ib_device *ibdev)
{
@@ -210,10 +154,10 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
int i;
- iser_err("page vec length %d data size %d\n",
- page_vec->length, page_vec->data_size);
- for (i = 0; i < page_vec->length; i++)
- iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]);
+ iser_err("page vec npages %d data length %d\n",
+ page_vec->npages, page_vec->fake_mr.length);
+ for (i = 0; i < page_vec->npages; i++)
+ iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]);
}
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
@@ -251,7 +195,11 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
struct scatterlist *sg = mem->sg;
reg->sge.lkey = device->pd->local_dma_lkey;
- reg->rkey = device->mr->rkey;
+ /*
+ * FIXME: rework the registration code path to differentiate
+ * rkey/lkey use cases
+ */
+ reg->rkey = device->mr ? device->mr->rkey : 0;
reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
@@ -262,11 +210,16 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
return 0;
}
-/**
- * iser_reg_page_vec - Register physical memory
- *
- * returns: 0 on success, errno code on failure
- */
+static int iser_set_page(struct ib_mr *mr, u64 addr)
+{
+ struct iser_page_vec *page_vec =
+ container_of(mr, struct iser_page_vec, fake_mr);
+
+ page_vec->pages[page_vec->npages++] = addr;
+
+ return 0;
+}
+
static
int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
struct iser_data_buf *mem,
@@ -280,22 +233,19 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
struct ib_pool_fmr *fmr;
int ret, plen;
- plen = iser_sg_to_page_vec(mem, device->ib_device,
- page_vec->pages,
- &page_vec->offset,
- &page_vec->data_size);
- page_vec->length = plen;
- if (plen * SIZE_4K < page_vec->data_size) {
+ page_vec->npages = 0;
+ page_vec->fake_mr.page_size = SIZE_4K;
+ plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
+ mem->size, iser_set_page);
+ if (unlikely(plen < mem->size)) {
iser_err("page vec too short to hold this SG\n");
iser_data_buf_dump(mem, device->ib_device);
iser_dump_page_vec(page_vec);
return -EINVAL;
}
- fmr = ib_fmr_pool_map_phys(fmr_pool,
- page_vec->pages,
- page_vec->length,
- page_vec->pages[0]);
+ fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages,
+ page_vec->npages, page_vec->pages[0]);
if (IS_ERR(fmr)) {
ret = PTR_ERR(fmr);
iser_err("ib_fmr_pool_map_phys failed: %d\n", ret);
@@ -304,8 +254,8 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
reg->sge.lkey = fmr->fmr->lkey;
reg->rkey = fmr->fmr->rkey;
- reg->sge.addr = page_vec->pages[0] + page_vec->offset;
- reg->sge.length = page_vec->data_size;
+ reg->sge.addr = page_vec->fake_mr.iova;
+ reg->sge.length = page_vec->fake_mr.length;
reg->mem_h = fmr;
iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
@@ -413,19 +363,16 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
*mask |= ISER_CHECK_GUARD;
}
-static void
-iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+static inline void
+iser_inv_rkey(struct ib_send_wr *inv_wr,
+ struct ib_mr *mr,
+ struct ib_cqe *cqe)
{
- u32 rkey;
-
inv_wr->opcode = IB_WR_LOCAL_INV;
- inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+ inv_wr->wr_cqe = cqe;
inv_wr->ex.invalidate_rkey = mr->rkey;
inv_wr->send_flags = 0;
inv_wr->num_sge = 0;
-
- rkey = ib_inc_rkey(mr->rkey);
- ib_update_fast_reg_key(mr, rkey);
}
static int
@@ -437,7 +384,9 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
{
struct iser_tx_desc *tx_desc = &iser_task->desc;
struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
+ struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
struct ib_sig_handover_wr *wr;
+ struct ib_mr *mr = pi_ctx->sig_mr;
int ret;
memset(sig_attrs, 0, sizeof(*sig_attrs));
@@ -447,17 +396,19 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
- if (!pi_ctx->sig_mr_valid)
- iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr);
+ if (pi_ctx->sig_mr_valid)
+ iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+
+ ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
wr = sig_handover_wr(iser_tx_next_wr(tx_desc));
wr->wr.opcode = IB_WR_REG_SIG_MR;
- wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+ wr->wr.wr_cqe = cqe;
wr->wr.sg_list = &data_reg->sge;
wr->wr.num_sge = 1;
wr->wr.send_flags = 0;
wr->sig_attrs = sig_attrs;
- wr->sig_mr = pi_ctx->sig_mr;
+ wr->sig_mr = mr;
if (scsi_prot_sg_count(iser_task->sc))
wr->prot = &prot_reg->sge;
else
@@ -465,10 +416,10 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
wr->access_flags = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
- pi_ctx->sig_mr_valid = 0;
+ pi_ctx->sig_mr_valid = 1;
- sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
- sig_reg->rkey = pi_ctx->sig_mr->rkey;
+ sig_reg->sge.lkey = mr->lkey;
+ sig_reg->rkey = mr->rkey;
sig_reg->sge.addr = 0;
sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
@@ -485,12 +436,15 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
struct iser_mem_reg *reg)
{
struct iser_tx_desc *tx_desc = &iser_task->desc;
+ struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
struct ib_mr *mr = rsc->mr;
struct ib_reg_wr *wr;
int n;
- if (!rsc->mr_valid)
- iser_inv_rkey(iser_tx_next_wr(tx_desc), mr);
+ if (rsc->mr_valid)
+ iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+
+ ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K);
if (unlikely(n != mem->size)) {
@@ -501,7 +455,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
wr = reg_wr(iser_tx_next_wr(tx_desc));
wr->wr.opcode = IB_WR_REG_MR;
- wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+ wr->wr.wr_cqe = cqe;
wr->wr.send_flags = 0;
wr->wr.num_sge = 0;
wr->mr = mr;
@@ -510,7 +464,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ;
- rsc->mr_valid = 0;
+ rsc->mr_valid = 1;
reg->sge.lkey = mr->lkey;
reg->rkey = mr->rkey;
@@ -554,7 +508,8 @@ iser_reg_data_sg(struct iscsi_iser_task *task,
}
int iser_reg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir)
+ enum iser_data_dir dir,
+ bool all_imm)
{
struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
struct iser_device *device = ib_conn->device;
@@ -565,8 +520,8 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
bool use_dma_key;
int err;
- use_dma_key = (mem->dma_nents == 1 && !iser_always_reg &&
- scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL);
+ use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) &&
+ scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL;
if (!use_dma_key) {
desc = device->reg_ops->reg_desc_get(ib_conn);
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 42f4da6..40c0f49 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -44,17 +44,6 @@
#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
ISCSI_ISER_MAX_CONN)
-static int iser_cq_poll_limit = 512;
-
-static void iser_cq_tasklet_fn(unsigned long data);
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
-
-static void iser_cq_event_callback(struct ib_event *cause, void *context)
-{
- iser_err("cq event %s (%d)\n",
- ib_event_msg(cause->event), cause->event);
-}
-
static void iser_qp_event_callback(struct ib_event *cause, void *context)
{
iser_err("qp event %s (%d)\n",
@@ -78,59 +67,40 @@ static void iser_event_handler(struct ib_event_handler *handler,
*/
static int iser_create_device_ib_res(struct iser_device *device)
{
- struct ib_device_attr *dev_attr = &device->dev_attr;
+ struct ib_device *ib_dev = device->ib_device;
int ret, i, max_cqe;
- ret = ib_query_device(device->ib_device, dev_attr);
- if (ret) {
- pr_warn("Query device failed for %s\n", device->ib_device->name);
- return ret;
- }
-
ret = iser_assign_reg_ops(device);
if (ret)
return ret;
device->comps_used = min_t(int, num_online_cpus(),
- device->ib_device->num_comp_vectors);
+ ib_dev->num_comp_vectors);
device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
GFP_KERNEL);
if (!device->comps)
goto comps_err;
- max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
+ max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
- device->comps_used, device->ib_device->name,
- device->ib_device->num_comp_vectors, max_cqe);
+ device->comps_used, ib_dev->name,
+ ib_dev->num_comp_vectors, max_cqe);
- device->pd = ib_alloc_pd(device->ib_device);
+ device->pd = ib_alloc_pd(ib_dev);
if (IS_ERR(device->pd))
goto pd_err;
for (i = 0; i < device->comps_used; i++) {
- struct ib_cq_init_attr cq_attr = {};
struct iser_comp *comp = &device->comps[i];
- comp->device = device;
- cq_attr.cqe = max_cqe;
- cq_attr.comp_vector = i;
- comp->cq = ib_create_cq(device->ib_device,
- iser_cq_callback,
- iser_cq_event_callback,
- (void *)comp,
- &cq_attr);
+ comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i,
+ IB_POLL_SOFTIRQ);
if (IS_ERR(comp->cq)) {
comp->cq = NULL;
goto cq_err;
}
-
- if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
- goto cq_err;
-
- tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
- (unsigned long)comp);
}
if (!iser_always_reg) {
@@ -140,11 +110,11 @@ static int iser_create_device_ib_res(struct iser_device *device)
device->mr = ib_get_dma_mr(device->pd, access);
if (IS_ERR(device->mr))
- goto dma_mr_err;
+ goto cq_err;
}
- INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
- iser_event_handler);
+ INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev,
+ iser_event_handler);
if (ib_register_event_handler(&device->event_handler))
goto handler_err;
@@ -153,15 +123,12 @@ static int iser_create_device_ib_res(struct iser_device *device)
handler_err:
if (device->mr)
ib_dereg_mr(device->mr);
-dma_mr_err:
- for (i = 0; i < device->comps_used; i++)
- tasklet_kill(&device->comps[i].tasklet);
cq_err:
for (i = 0; i < device->comps_used; i++) {
struct iser_comp *comp = &device->comps[i];
if (comp->cq)
- ib_destroy_cq(comp->cq);
+ ib_free_cq(comp->cq);
}
ib_dealloc_pd(device->pd);
pd_err:
@@ -182,8 +149,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
for (i = 0; i < device->comps_used; i++) {
struct iser_comp *comp = &device->comps[i];
- tasklet_kill(&comp->tasklet);
- ib_destroy_cq(comp->cq);
+ ib_free_cq(comp->cq);
comp->cq = NULL;
}
@@ -299,7 +265,7 @@ iser_alloc_reg_res(struct ib_device *ib_device,
iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
return ret;
}
- res->mr_valid = 1;
+ res->mr_valid = 0;
return 0;
}
@@ -336,7 +302,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device,
ret = PTR_ERR(pi_ctx->sig_mr);
goto sig_mr_failure;
}
- pi_ctx->sig_mr_valid = 1;
+ pi_ctx->sig_mr_valid = 0;
desc->pi_ctx->sig_protected = 0;
return 0;
@@ -461,10 +427,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
*/
static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
{
- struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
- ib_conn);
+ struct iser_conn *iser_conn = to_iser_conn(ib_conn);
struct iser_device *device;
- struct ib_device_attr *dev_attr;
+ struct ib_device *ib_dev;
struct ib_qp_init_attr init_attr;
int ret = -ENOMEM;
int index, min_index = 0;
@@ -472,7 +437,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
BUG_ON(ib_conn->device == NULL);
device = ib_conn->device;
- dev_attr = &device->dev_attr;
+ ib_dev = device->ib_device;
memset(&init_attr, 0, sizeof init_attr);
@@ -503,16 +468,16 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
iser_conn->max_cmds =
ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
} else {
- if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
+ if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
iser_conn->max_cmds =
ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
} else {
- init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
+ init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr;
iser_conn->max_cmds =
- ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
+ ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr);
iser_dbg("device %s supports max_send_wr %d\n",
- device->ib_device->name, dev_attr->max_qp_wr);
+ device->ib_device->name, ib_dev->attrs.max_qp_wr);
}
}
@@ -724,13 +689,13 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
iser_conn, err);
/* post an indication that all flush errors were consumed */
- err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
+ err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr);
if (err) {
- iser_err("conn %p failed to post beacon", ib_conn);
+ iser_err("conn %p failed to post last wr", ib_conn);
return 1;
}
- wait_for_completion(&ib_conn->flush_comp);
+ wait_for_completion(&ib_conn->last_comp);
}
return 1;
@@ -756,7 +721,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE,
- device->dev_attr.max_fast_reg_page_list_len);
+ device->ib_device->attrs.max_fast_reg_page_list_len);
if (sg_tablesize > sup_sg_tablesize) {
sg_tablesize = sup_sg_tablesize;
@@ -799,7 +764,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
/* connection T10-PI support */
if (iser_pi_enable) {
- if (!(device->dev_attr.device_cap_flags &
+ if (!(device->ib_device->attrs.device_cap_flags &
IB_DEVICE_SIGNATURE_HANDOVER)) {
iser_warn("T10-PI requested but not supported on %s, "
"continue without T10-PI\n",
@@ -841,16 +806,17 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
goto failure;
memset(&conn_param, 0, sizeof conn_param);
- conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
+ conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom;
conn_param.initiator_depth = 1;
conn_param.retry_count = 7;
conn_param.rnr_retry_count = 6;
memset(&req_hdr, 0, sizeof(req_hdr));
- req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
- ISER_SEND_W_INV_NOT_SUPPORTED);
- conn_param.private_data = (void *)&req_hdr;
- conn_param.private_data_len = sizeof(struct iser_cm_hdr);
+ req_hdr.flags = ISER_ZBVA_NOT_SUP;
+ if (!device->remote_inv_sup)
+ req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP;
+ conn_param.private_data = (void *)&req_hdr;
+ conn_param.private_data_len = sizeof(struct iser_cm_hdr);
ret = rdma_connect(cma_id, &conn_param);
if (ret) {
@@ -863,7 +829,8 @@ failure:
iser_connect_error(cma_id);
}
-static void iser_connected_handler(struct rdma_cm_id *cma_id)
+static void iser_connected_handler(struct rdma_cm_id *cma_id,
+ const void *private_data)
{
struct iser_conn *iser_conn;
struct ib_qp_attr attr;
@@ -877,6 +844,15 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);
+ if (private_data) {
+ u8 flags = *(u8 *)private_data;
+
+ iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP);
+ }
+
+ iser_info("conn %p: negotiated %s invalidation\n",
+ iser_conn, iser_conn->snd_w_inv ? "remote" : "local");
+
iser_conn->state = ISER_CONN_UP;
complete(&iser_conn->up_completion);
}
@@ -928,7 +904,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
iser_route_handler(cma_id);
break;
case RDMA_CM_EVENT_ESTABLISHED:
- iser_connected_handler(cma_id);
+ iser_connected_handler(cma_id, event->param.conn.private_data);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
@@ -967,14 +943,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
void iser_conn_init(struct iser_conn *iser_conn)
{
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
iser_conn->state = ISER_CONN_INIT;
- iser_conn->ib_conn.post_recv_buf_count = 0;
- init_completion(&iser_conn->ib_conn.flush_comp);
init_completion(&iser_conn->stop_completion);
init_completion(&iser_conn->ib_completion);
init_completion(&iser_conn->up_completion);
INIT_LIST_HEAD(&iser_conn->conn_list);
mutex_init(&iser_conn->state_mutex);
+
+ ib_conn->post_recv_buf_count = 0;
+ ib_conn->reg_cqe.done = iser_reg_comp;
+ ib_conn->last_cqe.done = iser_last_comp;
+ ib_conn->last.wr_cqe = &ib_conn->last_cqe;
+ ib_conn->last.opcode = IB_WR_SEND;
+ init_completion(&ib_conn->last_comp);
}
/**
@@ -1000,9 +983,6 @@ int iser_connect(struct iser_conn *iser_conn,
iser_conn->state = ISER_CONN_PENDING;
- ib_conn->beacon.wr_id = ISER_BEACON_WRID;
- ib_conn->beacon.opcode = IB_WR_SEND;
-
ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler,
(void *)iser_conn,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1045,56 +1025,60 @@ connect_failure:
int iser_post_recvl(struct iser_conn *iser_conn)
{
- struct ib_recv_wr rx_wr, *rx_wr_failed;
struct ib_conn *ib_conn = &iser_conn->ib_conn;
- struct ib_sge sge;
+ struct iser_login_desc *desc = &iser_conn->login_desc;
+ struct ib_recv_wr wr, *wr_failed;
int ib_ret;
- sge.addr = iser_conn->login_resp_dma;
- sge.length = ISER_RX_LOGIN_SIZE;
- sge.lkey = ib_conn->device->pd->local_dma_lkey;
+ desc->sge.addr = desc->rsp_dma;
+ desc->sge.length = ISER_RX_LOGIN_SIZE;
+ desc->sge.lkey = ib_conn->device->pd->local_dma_lkey;
- rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
- rx_wr.sg_list = &sge;
- rx_wr.num_sge = 1;
- rx_wr.next = NULL;
+ desc->cqe.done = iser_login_rsp;
+ wr.wr_cqe = &desc->cqe;
+ wr.sg_list = &desc->sge;
+ wr.num_sge = 1;
+ wr.next = NULL;
ib_conn->post_recv_buf_count++;
- ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+ ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed);
if (ib_ret) {
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
ib_conn->post_recv_buf_count--;
}
+
return ib_ret;
}
int iser_post_recvm(struct iser_conn *iser_conn, int count)
{
- struct ib_recv_wr *rx_wr, *rx_wr_failed;
- int i, ib_ret;
struct ib_conn *ib_conn = &iser_conn->ib_conn;
unsigned int my_rx_head = iser_conn->rx_desc_head;
struct iser_rx_desc *rx_desc;
+ struct ib_recv_wr *wr, *wr_failed;
+ int i, ib_ret;
- for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
- rx_desc = &iser_conn->rx_descs[my_rx_head];
- rx_wr->wr_id = (uintptr_t)rx_desc;
- rx_wr->sg_list = &rx_desc->rx_sg;
- rx_wr->num_sge = 1;
- rx_wr->next = rx_wr + 1;
+ for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
+ rx_desc = &iser_conn->rx_descs[my_rx_head];
+ rx_desc->cqe.done = iser_task_rsp;
+ wr->wr_cqe = &rx_desc->cqe;
+ wr->sg_list = &rx_desc->rx_sg;
+ wr->num_sge = 1;
+ wr->next = wr + 1;
my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
}
- rx_wr--;
- rx_wr->next = NULL; /* mark end of work requests list */
+ wr--;
+ wr->next = NULL; /* mark end of work requests list */
ib_conn->post_recv_buf_count += count;
- ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+ ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed);
if (ib_ret) {
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
ib_conn->post_recv_buf_count -= count;
} else
iser_conn->rx_desc_head = my_rx_head;
+
return ib_ret;
}
@@ -1115,7 +1099,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
DMA_TO_DEVICE);
wr->next = NULL;
- wr->wr_id = (uintptr_t)tx_desc;
+ wr->wr_cqe = &tx_desc->cqe;
wr->sg_list = tx_desc->tx_sg;
wr->num_sge = tx_desc->num_sge;
wr->opcode = IB_WR_SEND;
@@ -1129,149 +1113,6 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
return ib_ret;
}
-/**
- * is_iser_tx_desc - Indicate if the completion wr_id
- * is a TX descriptor or not.
- * @iser_conn: iser connection
- * @wr_id: completion WR identifier
- *
- * Since we cannot rely on wc opcode in FLUSH errors
- * we must work around it by checking if the wr_id address
- * falls in the iser connection rx_descs buffer. If so
- * it is an RX descriptor, otherwize it is a TX.
- */
-static inline bool
-is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
-{
- void *start = iser_conn->rx_descs;
- int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
-
- if (wr_id >= start && wr_id < start + len)
- return false;
-
- return true;
-}
-
-/**
- * iser_handle_comp_error() - Handle error completion
- * @ib_conn: connection RDMA resources
- * @wc: work completion
- *
- * Notes: We may handle a FLUSH error completion and in this case
- * we only cleanup in case TX type was DATAOUT. For non-FLUSH
- * error completion we should also notify iscsi layer that
- * connection is failed (in case we passed bind stage).
- */
-static void
-iser_handle_comp_error(struct ib_conn *ib_conn,
- struct ib_wc *wc)
-{
- void *wr_id = (void *)(uintptr_t)wc->wr_id;
- struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
- ib_conn);
-
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- if (iser_conn->iscsi_conn)
- iscsi_conn_failure(iser_conn->iscsi_conn,
- ISCSI_ERR_CONN_FAILED);
-
- if (wc->wr_id == ISER_FASTREG_LI_WRID)
- return;
-
- if (is_iser_tx_desc(iser_conn, wr_id)) {
- struct iser_tx_desc *desc = wr_id;
-
- if (desc->type == ISCSI_TX_DATAOUT)
- kmem_cache_free(ig.desc_cache, desc);
- } else {
- ib_conn->post_recv_buf_count--;
- }
-}
-
-/**
- * iser_handle_wc - handle a single work completion
- * @wc: work completion
- *
- * Soft-IRQ context, work completion can be either
- * SEND or RECV, and can turn out successful or
- * with error (or flush error).
- */
-static void iser_handle_wc(struct ib_wc *wc)
-{
- struct ib_conn *ib_conn;
- struct iser_tx_desc *tx_desc;
- struct iser_rx_desc *rx_desc;
-
- ib_conn = wc->qp->qp_context;
- if (likely(wc->status == IB_WC_SUCCESS)) {
- if (wc->opcode == IB_WC_RECV) {
- rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
- iser_rcv_completion(rx_desc, wc->byte_len,
- ib_conn);
- } else
- if (wc->opcode == IB_WC_SEND) {
- tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
- iser_snd_completion(tx_desc, ib_conn);
- } else {
- iser_err("Unknown wc opcode %d\n", wc->opcode);
- }
- } else {
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- iser_err("%s (%d): wr id %llx vend_err %x\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->wr_id, wc->vendor_err);
- else
- iser_dbg("%s (%d): wr id %llx\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->wr_id);
-
- if (wc->wr_id == ISER_BEACON_WRID)
- /* all flush errors were consumed */
- complete(&ib_conn->flush_comp);
- else
- iser_handle_comp_error(ib_conn, wc);
- }
-}
-
-/**
- * iser_cq_tasklet_fn - iSER completion polling loop
- * @data: iSER completion context
- *
- * Soft-IRQ context, polling connection CQ until
- * either CQ was empty or we exausted polling budget
- */
-static void iser_cq_tasklet_fn(unsigned long data)
-{
- struct iser_comp *comp = (struct iser_comp *)data;
- struct ib_cq *cq = comp->cq;
- struct ib_wc *const wcs = comp->wcs;
- int i, n, completed = 0;
-
- while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
- for (i = 0; i < n; i++)
- iser_handle_wc(&wcs[i]);
-
- completed += n;
- if (completed >= iser_cq_poll_limit)
- break;
- }
-
- /*
- * It is assumed here that arming CQ only once its empty
- * would not cause interrupts to be missed.
- */
- ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-
- iser_dbg("got %d completions\n", completed);
-}
-
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
-{
- struct iser_comp *comp = cq_context;
-
- tasklet_schedule(&comp->tasklet);
-}
-
u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir, sector_t *sector)
{
@@ -1319,3 +1160,21 @@ err:
/* Not alot we can do here, return ambiguous guard error */
return 0x1;
}
+
+void iser_err_comp(struct ib_wc *wc, const char *type)
+{
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context);
+
+ iser_err("%s failure: %s (%d) vend_err %x\n", type,
+ ib_wc_status_msg(wc->status), wc->status,
+ wc->vendor_err);
+
+ if (iser_conn->iscsi_conn)
+ iscsi_conn_failure(iser_conn->iscsi_conn,
+ ISCSI_ERR_CONN_FAILED);
+ } else {
+ iser_dbg("%s failure: %s (%d)\n", type,
+ ib_wc_status_msg(wc->status), wc->status);
+ }
+}
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 468c5e1..f121e61 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -29,7 +29,6 @@
#include <target/iscsi/iscsi_transport.h>
#include <linux/semaphore.h>
-#include "isert_proto.h"
#include "ib_isert.h"
#define ISERT_MAX_CONN 8
@@ -95,22 +94,6 @@ isert_qp_event_callback(struct ib_event *e, void *context)
}
}
-static int
-isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr)
-{
- int ret;
-
- ret = ib_query_device(ib_dev, devattr);
- if (ret) {
- isert_err("ib_query_device() failed: %d\n", ret);
- return ret;
- }
- isert_dbg("devattr->max_sge: %d\n", devattr->max_sge);
- isert_dbg("devattr->max_sge_rd: %d\n", devattr->max_sge_rd);
-
- return 0;
-}
-
static struct isert_comp *
isert_comp_get(struct isert_conn *isert_conn)
{
@@ -157,9 +140,9 @@ isert_create_qp(struct isert_conn *isert_conn,
attr.recv_cq = comp->cq;
attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS;
attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
- attr.cap.max_send_sge = device->dev_attr.max_sge;
- isert_conn->max_sge = min(device->dev_attr.max_sge,
- device->dev_attr.max_sge_rd);
+ attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
+ isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
+ device->ib_device->attrs.max_sge_rd);
attr.cap.max_recv_sge = 1;
attr.sq_sig_type = IB_SIGNAL_REQ_WR;
attr.qp_type = IB_QPT_RC;
@@ -287,8 +270,7 @@ isert_free_comps(struct isert_device *device)
}
static int
-isert_alloc_comps(struct isert_device *device,
- struct ib_device_attr *attr)
+isert_alloc_comps(struct isert_device *device)
{
int i, max_cqe, ret = 0;
@@ -308,7 +290,7 @@ isert_alloc_comps(struct isert_device *device,
return -ENOMEM;
}
- max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe);
+ max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe);
for (i = 0; i < device->comps_used; i++) {
struct ib_cq_init_attr cq_attr = {};
@@ -344,17 +326,15 @@ out_cq:
static int
isert_create_device_ib_res(struct isert_device *device)
{
- struct ib_device_attr *dev_attr;
+ struct ib_device *ib_dev = device->ib_device;
int ret;
- dev_attr = &device->dev_attr;
- ret = isert_query_device(device->ib_device, dev_attr);
- if (ret)
- goto out;
+ isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge);
+ isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
/* asign function handlers */
- if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
- dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
+ if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
+ ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
device->use_fastreg = 1;
device->reg_rdma_mem = isert_reg_rdma;
device->unreg_rdma_mem = isert_unreg_rdma;
@@ -364,11 +344,11 @@ isert_create_device_ib_res(struct isert_device *device)
device->unreg_rdma_mem = isert_unmap_cmd;
}
- ret = isert_alloc_comps(device, dev_attr);
+ ret = isert_alloc_comps(device);
if (ret)
goto out;
- device->pd = ib_alloc_pd(device->ib_device);
+ device->pd = ib_alloc_pd(ib_dev);
if (IS_ERR(device->pd)) {
ret = PTR_ERR(device->pd);
isert_err("failed to allocate pd, device %p, ret=%d\n",
@@ -377,7 +357,7 @@ isert_create_device_ib_res(struct isert_device *device)
}
/* Check signature cap */
- device->pi_capable = dev_attr->device_cap_flags &
+ device->pi_capable = ib_dev->attrs.device_cap_flags &
IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
return 0;
@@ -676,6 +656,32 @@ out_login_buf:
return ret;
}
+static void
+isert_set_nego_params(struct isert_conn *isert_conn,
+ struct rdma_conn_param *param)
+{
+ struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs;
+
+ /* Set max inflight RDMA READ requests */
+ isert_conn->initiator_depth = min_t(u8, param->initiator_depth,
+ attr->max_qp_init_rd_atom);
+ isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+
+ if (param->private_data) {
+ u8 flags = *(u8 *)param->private_data;
+
+ /*
+ * use remote invalidation if the both initiator
+ * and the HCA support it
+ */
+ isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) &&
+ (attr->device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ if (isert_conn->snd_w_inv)
+ isert_info("Using remote invalidation\n");
+ }
+}
+
static int
isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
@@ -714,11 +720,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
}
isert_conn->device = device;
- /* Set max inflight RDMA READ requests */
- isert_conn->initiator_depth = min_t(u8,
- event->param.conn.initiator_depth,
- device->dev_attr.max_qp_init_rd_atom);
- isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+ isert_set_nego_params(isert_conn, &event->param.conn);
ret = isert_conn_setup_qp(isert_conn, cma_id);
if (ret)
@@ -1050,8 +1052,8 @@ isert_create_send_desc(struct isert_conn *isert_conn,
ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr,
ISER_HEADERS_LEN, DMA_TO_DEVICE);
- memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
- tx_desc->iser_header.flags = ISER_VER;
+ memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl));
+ tx_desc->iser_header.flags = ISCSI_CTRL;
tx_desc->num_sge = 1;
tx_desc->isert_cmd = isert_cmd;
@@ -1097,7 +1099,14 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND;
send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
- send_wr->opcode = IB_WR_SEND;
+
+ if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) {
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
+ send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey;
+ } else {
+ send_wr->opcode = IB_WR_SEND;
+ }
+
send_wr->sg_list = &tx_desc->tx_sg[0];
send_wr->num_sge = isert_cmd->tx_desc.num_sge;
send_wr->send_flags = IB_SEND_SIGNALED;
@@ -1486,6 +1495,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
isert_cmd->read_va = read_va;
isert_cmd->write_stag = write_stag;
isert_cmd->write_va = write_va;
+ isert_cmd->inv_rkey = read_stag ? read_stag : write_stag;
ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd,
rx_desc, (unsigned char *)hdr);
@@ -1543,21 +1553,21 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
static void
isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
{
- struct iser_hdr *iser_hdr = &rx_desc->iser_header;
+ struct iser_ctrl *iser_ctrl = &rx_desc->iser_header;
uint64_t read_va = 0, write_va = 0;
uint32_t read_stag = 0, write_stag = 0;
- switch (iser_hdr->flags & 0xF0) {
+ switch (iser_ctrl->flags & 0xF0) {
case ISCSI_CTRL:
- if (iser_hdr->flags & ISER_RSV) {
- read_stag = be32_to_cpu(iser_hdr->read_stag);
- read_va = be64_to_cpu(iser_hdr->read_va);
+ if (iser_ctrl->flags & ISER_RSV) {
+ read_stag = be32_to_cpu(iser_ctrl->read_stag);
+ read_va = be64_to_cpu(iser_ctrl->read_va);
isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n",
read_stag, (unsigned long long)read_va);
}
- if (iser_hdr->flags & ISER_WSV) {
- write_stag = be32_to_cpu(iser_hdr->write_stag);
- write_va = be64_to_cpu(iser_hdr->write_va);
+ if (iser_ctrl->flags & ISER_WSV) {
+ write_stag = be32_to_cpu(iser_ctrl->write_stag);
+ write_va = be64_to_cpu(iser_ctrl->write_va);
isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n",
write_stag, (unsigned long long)write_va);
}
@@ -1568,7 +1578,7 @@ isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
isert_err("iSER Hello message\n");
break;
default:
- isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags);
+ isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags);
break;
}
@@ -3095,12 +3105,20 @@ isert_rdma_accept(struct isert_conn *isert_conn)
struct rdma_cm_id *cm_id = isert_conn->cm_id;
struct rdma_conn_param cp;
int ret;
+ struct iser_cm_hdr rsp_hdr;
memset(&cp, 0, sizeof(struct rdma_conn_param));
cp.initiator_depth = isert_conn->initiator_depth;
cp.retry_count = 7;
cp.rnr_retry_count = 7;
+ memset(&rsp_hdr, 0, sizeof(rsp_hdr));
+ rsp_hdr.flags = ISERT_ZBVA_NOT_USED;
+ if (!isert_conn->snd_w_inv)
+ rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED;
+ cp.private_data = (void *)&rsp_hdr;
+ cp.private_data_len = sizeof(rsp_hdr);
+
ret = rdma_accept(cm_id, &cp);
if (ret) {
isert_err("rdma_accept() failed with: %d\n", ret);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 3d7fbc4..8d50453 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -3,6 +3,8 @@
#include <linux/in6.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
+#include <scsi/iser.h>
+
#define DRV_NAME "isert"
#define PFX DRV_NAME ": "
@@ -31,6 +33,38 @@
#define isert_err(fmt, arg...) \
pr_err(PFX "%s: " fmt, __func__ , ## arg)
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + \
+ sizeof(struct iscsi_hdr))
+#define ISER_RECV_DATA_SEG_LEN 8192
+#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
+
+#define ISERT_MAX_RX_MISC_PDUS 6 /*
+ * NOOP_OUT(2), TEXT(1),
+ * SCSI_TMFUNC(2), LOGOUT(1)
+ */
+
+#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
+
+#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
+
+#define ISERT_INFLIGHT_DATAOUTS 8
+
+#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \
+ (1 + ISERT_INFLIGHT_DATAOUTS) + \
+ ISERT_MAX_TX_MISC_PDUS + \
+ ISERT_MAX_RX_MISC_PDUS)
+
+#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \
+ (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
+
#define ISCSI_ISER_SG_TABLESIZE 256
#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
#define ISER_BEACON_WRID 0xfffffffffffffffeULL
@@ -56,7 +90,7 @@ enum iser_conn_state {
};
struct iser_rx_desc {
- struct iser_hdr iser_header;
+ struct iser_ctrl iser_header;
struct iscsi_hdr iscsi_header;
char data[ISER_RECV_DATA_SEG_LEN];
u64 dma_addr;
@@ -65,7 +99,7 @@ struct iser_rx_desc {
} __packed;
struct iser_tx_desc {
- struct iser_hdr iser_header;
+ struct iser_ctrl iser_header;
struct iscsi_hdr iscsi_header;
enum isert_desc_type type;
u64 dma_addr;
@@ -129,6 +163,7 @@ struct isert_cmd {
uint32_t write_stag;
uint64_t read_va;
uint64_t write_va;
+ uint32_t inv_rkey;
u64 pdu_buf_dma;
u32 pdu_buf_len;
struct isert_conn *conn;
@@ -176,6 +211,7 @@ struct isert_conn {
struct work_struct release_work;
struct ib_recv_wr beacon;
bool logout_posted;
+ bool snd_w_inv;
};
#define ISERT_MAX_CQ 64
@@ -207,7 +243,6 @@ struct isert_device {
struct isert_comp *comps;
int comps_used;
struct list_head dev_node;
- struct ib_device_attr dev_attr;
int (*reg_rdma_mem)(struct iscsi_conn *conn,
struct iscsi_cmd *cmd,
struct isert_rdma_wr *wr);
diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h
deleted file mode 100644
index 4dccd313..0000000
--- a/drivers/infiniband/ulp/isert/isert_proto.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* From iscsi_iser.h */
-
-struct iser_hdr {
- u8 flags;
- u8 rsvd[3];
- __be32 write_stag; /* write rkey */
- __be64 write_va;
- __be32 read_stag; /* read rkey */
- __be64 read_va;
-} __packed;
-
-/*Constant PDU lengths calculations */
-#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
-
-#define ISER_RECV_DATA_SEG_LEN 8192
-#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
-#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
-
-/* QP settings */
-/* Maximal bounds on received asynchronous PDUs */
-#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
-
-#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), *
- * SCSI_TMFUNC(2), LOGOUT(1) */
-
-#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
-
-#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
-
-#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
-
-#define ISERT_INFLIGHT_DATAOUTS 8
-
-#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \
- (1 + ISERT_INFLIGHT_DATAOUTS) + \
- ISERT_MAX_TX_MISC_PDUS + \
- ISERT_MAX_RX_MISC_PDUS)
-
-#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \
- (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
-
-#define ISER_VER 0x10
-#define ISER_WSV 0x08
-#define ISER_RSV 0x04
-#define ISCSI_CTRL 0x10
-#define ISER_HELLO 0x20
-#define ISER_HELLORPLY 0x30
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 3db9a65..03022f6 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count,
static void srp_add_one(struct ib_device *device);
static void srp_remove_one(struct ib_device *device, void *client_data);
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+ const char *opname);
static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
static struct scsi_transport_template *ib_srp_transport_template;
@@ -445,6 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
dev->max_pages_per_mr);
}
+static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct srp_rdma_ch *ch = cq->cq_context;
+
+ complete(&ch->done);
+}
+
+static struct ib_cqe srp_drain_cqe = {
+ .done = srp_drain_done,
+};
+
/**
* srp_destroy_qp() - destroy an RDMA queue pair
* @ch: SRP RDMA channel.
@@ -457,10 +469,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
static void srp_destroy_qp(struct srp_rdma_ch *ch)
{
static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
- static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+ static struct ib_recv_wr wr = { 0 };
struct ib_recv_wr *bad_wr;
int ret;
+ wr.wr_cqe = &srp_drain_cqe;
/* Destroying a QP and reusing ch->done is only safe if not connected */
WARN_ON_ONCE(ch->connected);
@@ -489,34 +502,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
struct ib_fmr_pool *fmr_pool = NULL;
struct srp_fr_pool *fr_pool = NULL;
const int m = dev->use_fast_reg ? 3 : 1;
- struct ib_cq_init_attr cq_attr = {};
int ret;
init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
if (!init_attr)
return -ENOMEM;
- /* + 1 for SRP_LAST_WR_ID */
- cq_attr.cqe = target->queue_size + 1;
- cq_attr.comp_vector = ch->comp_vector;
- recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch,
- &cq_attr);
+ /* queue_size + 1 for ib_drain_qp */
+ recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1,
+ ch->comp_vector, IB_POLL_SOFTIRQ);
if (IS_ERR(recv_cq)) {
ret = PTR_ERR(recv_cq);
goto err;
}
- cq_attr.cqe = m * target->queue_size;
- cq_attr.comp_vector = ch->comp_vector;
- send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch,
- &cq_attr);
+ send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
+ ch->comp_vector, IB_POLL_DIRECT);
if (IS_ERR(send_cq)) {
ret = PTR_ERR(send_cq);
goto err_recv_cq;
}
- ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
-
init_attr->event_handler = srp_qp_event;
init_attr->cap.max_send_wr = m * target->queue_size;
init_attr->cap.max_recv_wr = target->queue_size + 1;
@@ -558,9 +564,9 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
if (ch->qp)
srp_destroy_qp(ch);
if (ch->recv_cq)
- ib_destroy_cq(ch->recv_cq);
+ ib_free_cq(ch->recv_cq);
if (ch->send_cq)
- ib_destroy_cq(ch->send_cq);
+ ib_free_cq(ch->send_cq);
ch->qp = qp;
ch->recv_cq = recv_cq;
@@ -580,13 +586,13 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
return 0;
err_qp:
- ib_destroy_qp(qp);
+ srp_destroy_qp(ch);
err_send_cq:
- ib_destroy_cq(send_cq);
+ ib_free_cq(send_cq);
err_recv_cq:
- ib_destroy_cq(recv_cq);
+ ib_free_cq(recv_cq);
err:
kfree(init_attr);
@@ -622,9 +628,10 @@ static void srp_free_ch_ib(struct srp_target_port *target,
if (ch->fmr_pool)
ib_destroy_fmr_pool(ch->fmr_pool);
}
+
srp_destroy_qp(ch);
- ib_destroy_cq(ch->send_cq);
- ib_destroy_cq(ch->recv_cq);
+ ib_free_cq(ch->send_cq);
+ ib_free_cq(ch->recv_cq);
/*
* Avoid that the SCSI error handler tries to use this channel after
@@ -1041,18 +1048,25 @@ out:
return ret <= 0 ? ret : -ENODEV;
}
-static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
+static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ srp_handle_qp_err(cq, wc, "INV RKEY");
+}
+
+static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
+ u32 rkey)
{
struct ib_send_wr *bad_wr;
struct ib_send_wr wr = {
.opcode = IB_WR_LOCAL_INV,
- .wr_id = LOCAL_INV_WR_ID_MASK,
.next = NULL,
.num_sge = 0,
.send_flags = 0,
.ex.invalidate_rkey = rkey,
};
+ wr.wr_cqe = &req->reg_cqe;
+ req->reg_cqe.done = srp_inv_rkey_err_done;
return ib_post_send(ch->qp, &wr, &bad_wr);
}
@@ -1074,7 +1088,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
struct srp_fr_desc **pfr;
for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
- res = srp_inv_rkey(ch, (*pfr)->mr->rkey);
+ res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey);
if (res < 0) {
shost_printk(KERN_ERR, target->scsi_host, PFX
"Queueing INV WR for rkey %#x failed (%d)\n",
@@ -1312,7 +1326,13 @@ reset_state:
return 0;
}
+static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ srp_handle_qp_err(cq, wc, "FAST REG");
+}
+
static int srp_map_finish_fr(struct srp_map_state *state,
+ struct srp_request *req,
struct srp_rdma_ch *ch, int sg_nents)
{
struct srp_target_port *target = ch->target;
@@ -1349,9 +1369,11 @@ static int srp_map_finish_fr(struct srp_map_state *state,
if (unlikely(n < 0))
return n;
+ req->reg_cqe.done = srp_reg_mr_err_done;
+
wr.wr.next = NULL;
wr.wr.opcode = IB_WR_REG_MR;
- wr.wr.wr_id = FAST_REG_WR_ID_MASK;
+ wr.wr.wr_cqe = &req->reg_cqe;
wr.wr.num_sge = 0;
wr.wr.send_flags = 0;
wr.mr = desc->mr;
@@ -1455,7 +1477,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
while (count) {
int i, n;
- n = srp_map_finish_fr(state, ch, count);
+ n = srp_map_finish_fr(state, req, ch, count);
if (unlikely(n < 0))
return n;
@@ -1524,7 +1546,7 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
#ifdef CONFIG_NEED_SG_DMA_LENGTH
idb_sg->dma_length = idb_sg->length; /* hack^2 */
#endif
- ret = srp_map_finish_fr(&state, ch, 1);
+ ret = srp_map_finish_fr(&state, req, ch, 1);
if (ret < 0)
return ret;
} else if (dev->use_fmr) {
@@ -1719,7 +1741,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
struct srp_iu *iu;
- srp_send_completion(ch->send_cq, ch);
+ ib_process_cq_direct(ch->send_cq, -1);
if (list_empty(&ch->free_tx))
return NULL;
@@ -1739,6 +1761,19 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
return iu;
}
+static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+ struct srp_rdma_ch *ch = cq->cq_context;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ srp_handle_qp_err(cq, wc, "SEND");
+ return;
+ }
+
+ list_add(&iu->list, &ch->free_tx);
+}
+
static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
{
struct srp_target_port *target = ch->target;
@@ -1749,8 +1784,10 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
list.length = len;
list.lkey = target->lkey;
+ iu->cqe.done = srp_send_done;
+
wr.next = NULL;
- wr.wr_id = (uintptr_t) iu;
+ wr.wr_cqe = &iu->cqe;
wr.sg_list = &list;
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
@@ -1769,8 +1806,10 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
list.length = iu->size;
list.lkey = target->lkey;
+ iu->cqe.done = srp_recv_done;
+
wr.next = NULL;
- wr.wr_id = (uintptr_t) iu;
+ wr.wr_cqe = &iu->cqe;
wr.sg_list = &list;
wr.num_sge = 1;
@@ -1902,14 +1941,20 @@ static void srp_process_aer_req(struct srp_rdma_ch *ch,
"problems processing SRP_AER_REQ\n");
}
-static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc)
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+ struct srp_rdma_ch *ch = cq->cq_context;
struct srp_target_port *target = ch->target;
struct ib_device *dev = target->srp_host->srp_dev->dev;
- struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
int res;
u8 opcode;
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ srp_handle_qp_err(cq, wc, "RECV");
+ return;
+ }
+
ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len,
DMA_FROM_DEVICE);
@@ -1972,68 +2017,22 @@ static void srp_tl_err_work(struct work_struct *work)
srp_start_tl_fail_timers(target->rport);
}
-static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status,
- bool send_err, struct srp_rdma_ch *ch)
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+ const char *opname)
{
+ struct srp_rdma_ch *ch = cq->cq_context;
struct srp_target_port *target = ch->target;
- if (wr_id == SRP_LAST_WR_ID) {
- complete(&ch->done);
- return;
- }
-
if (ch->connected && !target->qp_in_error) {
- if (wr_id & LOCAL_INV_WR_ID_MASK) {
- shost_printk(KERN_ERR, target->scsi_host, PFX
- "LOCAL_INV failed with status %s (%d)\n",
- ib_wc_status_msg(wc_status), wc_status);
- } else if (wr_id & FAST_REG_WR_ID_MASK) {
- shost_printk(KERN_ERR, target->scsi_host, PFX
- "FAST_REG_MR failed status %s (%d)\n",
- ib_wc_status_msg(wc_status), wc_status);
- } else {
- shost_printk(KERN_ERR, target->scsi_host,
- PFX "failed %s status %s (%d) for iu %p\n",
- send_err ? "send" : "receive",
- ib_wc_status_msg(wc_status), wc_status,
- (void *)(uintptr_t)wr_id);
- }
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "failed %s status %s (%d) for CQE %p\n",
+ opname, ib_wc_status_msg(wc->status), wc->status,
+ wc->wr_cqe);
queue_work(system_long_wq, &target->tl_err_work);
}
target->qp_in_error = true;
}
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr)
-{
- struct srp_rdma_ch *ch = ch_ptr;
- struct ib_wc wc;
-
- ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- if (likely(wc.status == IB_WC_SUCCESS)) {
- srp_handle_recv(ch, &wc);
- } else {
- srp_handle_qp_err(wc.wr_id, wc.status, false, ch);
- }
- }
-}
-
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr)
-{
- struct srp_rdma_ch *ch = ch_ptr;
- struct ib_wc wc;
- struct srp_iu *iu;
-
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- if (likely(wc.status == IB_WC_SUCCESS)) {
- iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
- list_add(&iu->list, &ch->free_tx);
- } else {
- srp_handle_qp_err(wc.wr_id, wc.status, true, ch);
- }
- }
-}
-
static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
{
struct srp_target_port *target = host_to_target(shost);
@@ -3439,27 +3438,17 @@ free_host:
static void srp_add_one(struct ib_device *device)
{
struct srp_device *srp_dev;
- struct ib_device_attr *dev_attr;
struct srp_host *host;
int mr_page_shift, p;
u64 max_pages_per_mr;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- pr_warn("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
if (!srp_dev)
- goto free_attr;
+ return;
srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
device->map_phys_fmr && device->unmap_fmr);
- srp_dev->has_fr = (dev_attr->device_cap_flags &
+ srp_dev->has_fr = (device->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
if (!srp_dev->has_fmr && !srp_dev->has_fr)
dev_warn(&device->dev, "neither FMR nor FR is supported\n");
@@ -3473,23 +3462,23 @@ static void srp_add_one(struct ib_device *device)
* minimum of 4096 bytes. We're unlikely to build large sglists
* out of smaller entries.
*/
- mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1);
+ mr_page_shift = max(12, ffs(device->attrs.page_size_cap) - 1);
srp_dev->mr_page_size = 1 << mr_page_shift;
srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1);
- max_pages_per_mr = dev_attr->max_mr_size;
+ max_pages_per_mr = device->attrs.max_mr_size;
do_div(max_pages_per_mr, srp_dev->mr_page_size);
srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
max_pages_per_mr);
if (srp_dev->use_fast_reg) {
srp_dev->max_pages_per_mr =
min_t(u32, srp_dev->max_pages_per_mr,
- dev_attr->max_fast_reg_page_list_len);
+ device->attrs.max_fast_reg_page_list_len);
}
srp_dev->mr_max_size = srp_dev->mr_page_size *
srp_dev->max_pages_per_mr;
- pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
- device->name, mr_page_shift, dev_attr->max_mr_size,
- dev_attr->max_fast_reg_page_list_len,
+ pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
+ device->name, mr_page_shift, device->attrs.max_mr_size,
+ device->attrs.max_fast_reg_page_list_len,
srp_dev->max_pages_per_mr, srp_dev->mr_max_size);
INIT_LIST_HEAD(&srp_dev->dev_list);
@@ -3517,17 +3506,13 @@ static void srp_add_one(struct ib_device *device)
}
ib_set_client_data(device, &srp_client, srp_dev);
-
- goto free_attr;
+ return;
err_pd:
ib_dealloc_pd(srp_dev->pd);
free_dev:
kfree(srp_dev);
-
-free_attr:
- kfree(dev_attr);
}
static void srp_remove_one(struct ib_device *device, void *client_data)
@@ -3587,8 +3572,6 @@ static int __init srp_init_module(void)
{
int ret;
- BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *));
-
if (srp_sg_tablesize) {
pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
if (!cmd_sg_entries)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index f6af531..9e05ce4 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -66,11 +66,6 @@ enum {
SRP_TAG_TSK_MGMT = 1U << 31,
SRP_MAX_PAGES_PER_MR = 512,
-
- LOCAL_INV_WR_ID_MASK = 1,
- FAST_REG_WR_ID_MASK = 2,
-
- SRP_LAST_WR_ID = 0xfffffffcU,
};
enum srp_target_state {
@@ -128,6 +123,7 @@ struct srp_request {
struct srp_direct_buf *indirect_desc;
dma_addr_t indirect_dma_addr;
short nmdesc;
+ struct ib_cqe reg_cqe;
};
/**
@@ -231,6 +227,7 @@ struct srp_iu {
void *buf;
size_t size;
enum dma_data_direction direction;
+ struct ib_cqe cqe;
};
/**
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index bc5470c..0c37fee 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -93,6 +93,8 @@ MODULE_PARM_DESC(srpt_service_guid,
static struct ib_client srpt_client;
static void srpt_release_channel(struct srpt_rdma_ch *ch);
static int srpt_queue_status(struct se_cmd *cmd);
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc);
/**
* opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
@@ -341,10 +343,10 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot,
memset(iocp, 0, sizeof *iocp);
strcpy(iocp->id_string, SRPT_ID_STRING);
iocp->guid = cpu_to_be64(srpt_service_guid);
- iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
- iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id);
- iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver);
- iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+ iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id);
+ iocp->device_id = cpu_to_be32(sdev->device->attrs.vendor_part_id);
+ iocp->device_version = cpu_to_be16(sdev->device->attrs.hw_ver);
+ iocp->subsys_vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id);
iocp->subsys_device_id = 0x0;
iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS);
iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS);
@@ -453,6 +455,7 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
* srpt_mad_recv_handler() - MAD reception callback function.
*/
static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_wc)
{
struct srpt_port *sport = (struct srpt_port *)mad_agent->context;
@@ -778,12 +781,12 @@ static int srpt_post_recv(struct srpt_device *sdev,
struct ib_recv_wr wr, *bad_wr;
BUG_ON(!sdev);
- wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
-
list.addr = ioctx->ioctx.dma;
list.length = srp_max_req_size;
list.lkey = sdev->pd->local_dma_lkey;
+ ioctx->ioctx.cqe.done = srpt_recv_done;
+ wr.wr_cqe = &ioctx->ioctx.cqe;
wr.next = NULL;
wr.sg_list = &list;
wr.num_sge = 1;
@@ -819,8 +822,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
list.length = len;
list.lkey = sdev->pd->local_dma_lkey;
+ ioctx->ioctx.cqe.done = srpt_send_done;
wr.next = NULL;
- wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+ wr.wr_cqe = &ioctx->ioctx.cqe;
wr.sg_list = &list;
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
@@ -1052,13 +1056,13 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
BUG_ON(!ch);
BUG_ON(!ioctx);
- BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
+ BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs);
while (ioctx->n_rdma)
- kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+ kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list);
- kfree(ioctx->rdma_ius);
- ioctx->rdma_ius = NULL;
+ kfree(ioctx->rdma_wrs);
+ ioctx->rdma_wrs = NULL;
if (ioctx->mapped_sg_count) {
sg = ioctx->sg;
@@ -1082,7 +1086,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
struct scatterlist *sg, *sg_orig;
int sg_cnt;
enum dma_data_direction dir;
- struct rdma_iu *riu;
+ struct ib_rdma_wr *riu;
struct srp_direct_buf *db;
dma_addr_t dma_addr;
struct ib_sge *sge;
@@ -1109,23 +1113,24 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
ioctx->mapped_sg_count = count;
- if (ioctx->rdma_ius && ioctx->n_rdma_ius)
- nrdma = ioctx->n_rdma_ius;
+ if (ioctx->rdma_wrs && ioctx->n_rdma_wrs)
+ nrdma = ioctx->n_rdma_wrs;
else {
nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
+ ioctx->n_rbuf;
- ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
- if (!ioctx->rdma_ius)
+ ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs),
+ GFP_KERNEL);
+ if (!ioctx->rdma_wrs)
goto free_mem;
- ioctx->n_rdma_ius = nrdma;
+ ioctx->n_rdma_wrs = nrdma;
}
db = ioctx->rbufs;
tsize = cmd->data_length;
dma_len = ib_sg_dma_len(dev, &sg[0]);
- riu = ioctx->rdma_ius;
+ riu = ioctx->rdma_wrs;
/*
* For each remote desc - calculate the #ib_sge.
@@ -1139,9 +1144,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
rsize = be32_to_cpu(db->len);
raddr = be64_to_cpu(db->va);
- riu->raddr = raddr;
+ riu->remote_addr = raddr;
riu->rkey = be32_to_cpu(db->key);
- riu->sge_cnt = 0;
+ riu->wr.num_sge = 0;
/* calculate how many sge required for this remote_buf */
while (rsize > 0 && tsize > 0) {
@@ -1165,33 +1170,35 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
rsize = 0;
}
- ++riu->sge_cnt;
+ ++riu->wr.num_sge;
- if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+ if (rsize > 0 &&
+ riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
++ioctx->n_rdma;
- riu->sge =
- kmalloc(riu->sge_cnt * sizeof *riu->sge,
- GFP_KERNEL);
- if (!riu->sge)
+ riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+ sizeof(*riu->wr.sg_list),
+ GFP_KERNEL);
+ if (!riu->wr.sg_list)
goto free_mem;
++riu;
- riu->sge_cnt = 0;
- riu->raddr = raddr;
+ riu->wr.num_sge = 0;
+ riu->remote_addr = raddr;
riu->rkey = be32_to_cpu(db->key);
}
}
++ioctx->n_rdma;
- riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
- GFP_KERNEL);
- if (!riu->sge)
+ riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+ sizeof(*riu->wr.sg_list),
+ GFP_KERNEL);
+ if (!riu->wr.sg_list)
goto free_mem;
}
db = ioctx->rbufs;
tsize = cmd->data_length;
- riu = ioctx->rdma_ius;
+ riu = ioctx->rdma_wrs;
sg = sg_orig;
dma_len = ib_sg_dma_len(dev, &sg[0]);
dma_addr = ib_sg_dma_address(dev, &sg[0]);
@@ -1200,7 +1207,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
for (i = 0, j = 0;
j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
rsize = be32_to_cpu(db->len);
- sge = riu->sge;
+ sge = riu->wr.sg_list;
k = 0;
while (rsize > 0 && tsize > 0) {
@@ -1232,9 +1239,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
}
++k;
- if (k == riu->sge_cnt && rsize > 0 && tsize > 0) {
+ if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
++riu;
- sge = riu->sge;
+ sge = riu->wr.sg_list;
k = 0;
} else if (rsize > 0 && tsize > 0)
++sge;
@@ -1277,8 +1284,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
ioctx->n_rbuf = 0;
ioctx->rbufs = NULL;
ioctx->n_rdma = 0;
- ioctx->n_rdma_ius = 0;
- ioctx->rdma_ius = NULL;
+ ioctx->n_rdma_wrs = 0;
+ ioctx->rdma_wrs = NULL;
ioctx->mapped_sg_count = 0;
init_completion(&ioctx->tx_done);
ioctx->queue_status_only = false;
@@ -1380,118 +1387,44 @@ out:
}
/**
- * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
- */
-static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
-{
- struct srpt_send_ioctx *ioctx;
- enum srpt_command_state state;
- u32 index;
-
- atomic_inc(&ch->sq_wr_avail);
-
- index = idx_from_wr_id(wr_id);
- ioctx = ch->ioctx_ring[index];
- state = srpt_get_cmd_state(ioctx);
-
- WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
- && state != SRPT_STATE_MGMT_RSP_SENT
- && state != SRPT_STATE_NEED_DATA
- && state != SRPT_STATE_DONE);
-
- /* If SRP_RSP sending failed, undo the ch->req_lim change. */
- if (state == SRPT_STATE_CMD_RSP_SENT
- || state == SRPT_STATE_MGMT_RSP_SENT)
- atomic_dec(&ch->req_lim);
-
- srpt_abort_cmd(ioctx);
-}
-
-/**
- * srpt_handle_send_comp() - Process an IB send completion notification.
- */
-static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
- enum srpt_command_state state;
-
- atomic_inc(&ch->sq_wr_avail);
-
- state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-
- if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
- && state != SRPT_STATE_MGMT_RSP_SENT
- && state != SRPT_STATE_DONE))
- pr_debug("state = %d\n", state);
-
- if (state != SRPT_STATE_DONE) {
- srpt_unmap_sg_to_ib_sge(ch, ioctx);
- transport_generic_free_cmd(&ioctx->cmd, 0);
- } else {
- pr_err("IB completion has been received too late for"
- " wr_id = %u.\n", ioctx->ioctx.index);
- }
-}
-
-/**
- * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
- *
* XXX: what is now target_execute_cmd used to be asynchronous, and unmapping
* the data that has been transferred via IB RDMA had to be postponed until the
* check_stop_free() callback. None of this is necessary anymore and needs to
* be cleaned up.
*/
-static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx,
- enum srpt_opcode opcode)
+static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_send_ioctx *ioctx =
+ container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
+
WARN_ON(ioctx->n_rdma <= 0);
atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
- if (opcode == SRPT_RDMA_READ_LAST) {
- if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
- SRPT_STATE_DATA_IN))
- target_execute_cmd(&ioctx->cmd);
- else
- pr_err("%s[%d]: wrong state = %d\n", __func__,
- __LINE__, srpt_get_cmd_state(ioctx));
- } else if (opcode == SRPT_RDMA_ABORT) {
- ioctx->rdma_aborted = true;
- } else {
- WARN(true, "unexpected opcode %d\n", opcode);
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
+ ioctx, wc->status);
+ srpt_abort_cmd(ioctx);
+ return;
}
+
+ if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
+ SRPT_STATE_DATA_IN))
+ target_execute_cmd(&ioctx->cmd);
+ else
+ pr_err("%s[%d]: wrong state = %d\n", __func__,
+ __LINE__, srpt_get_cmd_state(ioctx));
}
-/**
- * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion.
- */
-static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx,
- enum srpt_opcode opcode)
+static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
- enum srpt_command_state state;
+ struct srpt_send_ioctx *ioctx =
+ container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
- state = srpt_get_cmd_state(ioctx);
- switch (opcode) {
- case SRPT_RDMA_READ_LAST:
- if (ioctx->n_rdma <= 0) {
- pr_err("Received invalid RDMA read"
- " error completion with idx %d\n",
- ioctx->ioctx.index);
- break;
- }
- atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
- if (state == SRPT_STATE_NEED_DATA)
- srpt_abort_cmd(ioctx);
- else
- pr_err("%s[%d]: wrong state = %d\n",
- __func__, __LINE__, state);
- break;
- case SRPT_RDMA_WRITE_LAST:
- break;
- default:
- pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode);
- break;
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
+ ioctx, wc->status);
+ srpt_abort_cmd(ioctx);
}
}
@@ -1926,32 +1859,26 @@ out:
return;
}
-static void srpt_process_rcv_completion(struct ib_cq *cq,
- struct srpt_rdma_ch *ch,
- struct ib_wc *wc)
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_device *sdev = ch->sport->sdev;
- struct srpt_recv_ioctx *ioctx;
- u32 index;
+ struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_recv_ioctx *ioctx =
+ container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
- index = idx_from_wr_id(wc->wr_id);
if (wc->status == IB_WC_SUCCESS) {
int req_lim;
req_lim = atomic_dec_return(&ch->req_lim);
if (unlikely(req_lim < 0))
pr_err("req_lim = %d < 0\n", req_lim);
- ioctx = sdev->ioctx_ring[index];
srpt_handle_new_iu(ch, ioctx, NULL);
} else {
- pr_info("receiving failed for idx %u with status %d\n",
- index, wc->status);
+ pr_info("receiving failed for ioctx %p with status %d\n",
+ ioctx, wc->status);
}
}
/**
- * srpt_process_send_completion() - Process an IB send completion.
- *
* Note: Although this has not yet been observed during tests, at least in
* theory it is possible that the srpt_get_send_ioctx() call invoked by
* srpt_handle_new_iu() fails. This is possible because the req_lim_delta
@@ -1964,109 +1891,52 @@ static void srpt_process_rcv_completion(struct ib_cq *cq,
* are queued on cmd_wait_list. The code below processes these delayed
* requests one at a time.
*/
-static void srpt_process_send_completion(struct ib_cq *cq,
- struct srpt_rdma_ch *ch,
- struct ib_wc *wc)
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_send_ioctx *send_ioctx;
- uint32_t index;
- enum srpt_opcode opcode;
+ struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_send_ioctx *ioctx =
+ container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
+ enum srpt_command_state state;
- index = idx_from_wr_id(wc->wr_id);
- opcode = opcode_from_wr_id(wc->wr_id);
- send_ioctx = ch->ioctx_ring[index];
- if (wc->status == IB_WC_SUCCESS) {
- if (opcode == SRPT_SEND)
- srpt_handle_send_comp(ch, send_ioctx);
- else {
- WARN_ON(opcode != SRPT_RDMA_ABORT &&
- wc->opcode != IB_WC_RDMA_READ);
- srpt_handle_rdma_comp(ch, send_ioctx, opcode);
- }
+ state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+
+ WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
+ state != SRPT_STATE_MGMT_RSP_SENT);
+
+ atomic_inc(&ch->sq_wr_avail);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ pr_info("sending response for ioctx 0x%p failed"
+ " with status %d\n", ioctx, wc->status);
+
+ atomic_dec(&ch->req_lim);
+ srpt_abort_cmd(ioctx);
+ goto out;
+ }
+
+ if (state != SRPT_STATE_DONE) {
+ srpt_unmap_sg_to_ib_sge(ch, ioctx);
+ transport_generic_free_cmd(&ioctx->cmd, 0);
} else {
- if (opcode == SRPT_SEND) {
- pr_info("sending response for idx %u failed"
- " with status %d\n", index, wc->status);
- srpt_handle_send_err_comp(ch, wc->wr_id);
- } else if (opcode != SRPT_RDMA_MID) {
- pr_info("RDMA t %d for idx %u failed with"
- " status %d\n", opcode, index, wc->status);
- srpt_handle_rdma_err_comp(ch, send_ioctx, opcode);
- }
+ pr_err("IB completion has been received too late for"
+ " wr_id = %u.\n", ioctx->ioctx.index);
}
- while (unlikely(opcode == SRPT_SEND
- && !list_empty(&ch->cmd_wait_list)
- && srpt_get_ch_state(ch) == CH_LIVE
- && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) {
+out:
+ while (!list_empty(&ch->cmd_wait_list) &&
+ srpt_get_ch_state(ch) == CH_LIVE &&
+ (ioctx = srpt_get_send_ioctx(ch)) != NULL) {
struct srpt_recv_ioctx *recv_ioctx;
recv_ioctx = list_first_entry(&ch->cmd_wait_list,
struct srpt_recv_ioctx,
wait_list);
list_del(&recv_ioctx->wait_list);
- srpt_handle_new_iu(ch, recv_ioctx, send_ioctx);
- }
-}
-
-static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch)
-{
- struct ib_wc *const wc = ch->wc;
- int i, n;
-
- WARN_ON(cq != ch->cq);
-
- ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) {
- for (i = 0; i < n; i++) {
- if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV)
- srpt_process_rcv_completion(cq, ch, &wc[i]);
- else
- srpt_process_send_completion(cq, ch, &wc[i]);
- }
+ srpt_handle_new_iu(ch, recv_ioctx, ioctx);
}
}
/**
- * srpt_completion() - IB completion queue callback function.
- *
- * Notes:
- * - It is guaranteed that a completion handler will never be invoked
- * concurrently on two different CPUs for the same completion queue. See also
- * Documentation/infiniband/core_locking.txt and the implementation of
- * handle_edge_irq() in kernel/irq/chip.c.
- * - When threaded IRQs are enabled, completion handlers are invoked in thread
- * context instead of interrupt context.
- */
-static void srpt_completion(struct ib_cq *cq, void *ctx)
-{
- struct srpt_rdma_ch *ch = ctx;
-
- wake_up_interruptible(&ch->wait_queue);
-}
-
-static int srpt_compl_thread(void *arg)
-{
- struct srpt_rdma_ch *ch;
-
- /* Hibernation / freezing of the SRPT kernel thread is not supported. */
- current->flags |= PF_NOFREEZE;
-
- ch = arg;
- BUG_ON(!ch);
- pr_info("Session %s: kernel thread %s (PID %d) started\n",
- ch->sess_name, ch->thread->comm, current->pid);
- while (!kthread_should_stop()) {
- wait_event_interruptible(ch->wait_queue,
- (srpt_process_completion(ch->cq, ch),
- kthread_should_stop()));
- }
- pr_info("Session %s: kernel thread %s (PID %d) stopped\n",
- ch->sess_name, ch->thread->comm, current->pid);
- return 0;
-}
-
-/**
* srpt_create_ch_ib() - Create receive and send completion queues.
*/
static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
@@ -2075,7 +1945,6 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
struct srpt_port *sport = ch->sport;
struct srpt_device *sdev = sport->sdev;
u32 srp_sq_size = sport->port_attrib.srp_sq_size;
- struct ib_cq_init_attr cq_attr = {};
int ret;
WARN_ON(ch->rq_size < 1);
@@ -2086,9 +1955,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
goto out;
retry:
- cq_attr.cqe = ch->rq_size + srp_sq_size;
- ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch,
- &cq_attr);
+ ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size,
+ 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
if (IS_ERR(ch->cq)) {
ret = PTR_ERR(ch->cq);
pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -2131,18 +1999,6 @@ retry:
if (ret)
goto err_destroy_qp;
- init_waitqueue_head(&ch->wait_queue);
-
- pr_debug("creating thread for session %s\n", ch->sess_name);
-
- ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl");
- if (IS_ERR(ch->thread)) {
- pr_err("failed to create kernel thread %ld\n",
- PTR_ERR(ch->thread));
- ch->thread = NULL;
- goto err_destroy_qp;
- }
-
out:
kfree(qp_init);
return ret;
@@ -2150,17 +2006,14 @@ out:
err_destroy_qp:
ib_destroy_qp(ch->qp);
err_destroy_cq:
- ib_destroy_cq(ch->cq);
+ ib_free_cq(ch->cq);
goto out;
}
static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
{
- if (ch->thread)
- kthread_stop(ch->thread);
-
ib_destroy_qp(ch->qp);
- ib_destroy_cq(ch->cq);
+ ib_free_cq(ch->cq);
}
/**
@@ -2808,12 +2661,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
struct srpt_send_ioctx *ioctx)
{
- struct ib_rdma_wr wr;
struct ib_send_wr *bad_wr;
- struct rdma_iu *riu;
- int i;
- int ret;
- int sq_wr_avail;
+ int sq_wr_avail, ret, i;
enum dma_data_direction dir;
const int n_rdma = ioctx->n_rdma;
@@ -2829,59 +2678,32 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
}
}
- ioctx->rdma_aborted = false;
- ret = 0;
- riu = ioctx->rdma_ius;
- memset(&wr, 0, sizeof wr);
-
- for (i = 0; i < n_rdma; ++i, ++riu) {
- if (dir == DMA_FROM_DEVICE) {
- wr.wr.opcode = IB_WR_RDMA_WRITE;
- wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
- SRPT_RDMA_WRITE_LAST :
- SRPT_RDMA_MID,
- ioctx->ioctx.index);
- } else {
- wr.wr.opcode = IB_WR_RDMA_READ;
- wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
- SRPT_RDMA_READ_LAST :
- SRPT_RDMA_MID,
- ioctx->ioctx.index);
- }
- wr.wr.next = NULL;
- wr.remote_addr = riu->raddr;
- wr.rkey = riu->rkey;
- wr.wr.num_sge = riu->sge_cnt;
- wr.wr.sg_list = riu->sge;
+ for (i = 0; i < n_rdma; i++) {
+ struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr;
- /* only get completion event for the last rdma write */
- if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE)
- wr.wr.send_flags = IB_SEND_SIGNALED;
+ wr->opcode = (dir == DMA_FROM_DEVICE) ?
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
- ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
- if (ret)
- break;
+ if (i == n_rdma - 1) {
+ /* only get completion event for the last rdma read */
+ if (dir == DMA_TO_DEVICE) {
+ wr->send_flags = IB_SEND_SIGNALED;
+ ioctx->rdma_cqe.done = srpt_rdma_read_done;
+ } else {
+ ioctx->rdma_cqe.done = srpt_rdma_write_done;
+ }
+ wr->wr_cqe = &ioctx->rdma_cqe;
+ wr->next = NULL;
+ } else {
+ wr->wr_cqe = NULL;
+ wr->next = &ioctx->rdma_wrs[i + 1].wr;
+ }
}
+ ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr);
if (ret)
pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
__func__, __LINE__, ret, i, n_rdma);
- if (ret && i > 0) {
- wr.wr.num_sge = 0;
- wr.wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index);
- wr.wr.send_flags = IB_SEND_SIGNALED;
- while (ch->state == CH_LIVE &&
- ib_post_send(ch->qp, &wr.wr, &bad_wr) != 0) {
- pr_info("Trying to abort failed RDMA transfer [%d]\n",
- ioctx->ioctx.index);
- msleep(1000);
- }
- while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) {
- pr_info("Waiting until RDMA abort finished [%d]\n",
- ioctx->ioctx.index);
- msleep(1000);
- }
- }
out:
if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
atomic_add(n_rdma, &ch->sq_wr_avail);
@@ -3190,14 +3012,11 @@ static void srpt_add_one(struct ib_device *device)
init_waitqueue_head(&sdev->ch_releaseQ);
spin_lock_init(&sdev->spinlock);
- if (ib_query_device(device, &sdev->dev_attr))
- goto free_dev;
-
sdev->pd = ib_alloc_pd(device);
if (IS_ERR(sdev->pd))
goto free_dev;
- sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
+ sdev->srq_size = min(srpt_srq_size, sdev->device->attrs.max_srq_wr);
srq_attr.event_handler = srpt_srq_event;
srq_attr.srq_context = (void *)sdev;
@@ -3211,7 +3030,7 @@ static void srpt_add_one(struct ib_device *device)
goto err_pd;
pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
- __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
+ __func__, sdev->srq_size, sdev->device->attrs.max_srq_wr,
device->name);
if (!srpt_service_guid)
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 5366e0a..09037f2b 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -128,36 +128,6 @@ enum {
DEFAULT_MAX_RDMA_SIZE = 65536,
};
-enum srpt_opcode {
- SRPT_RECV,
- SRPT_SEND,
- SRPT_RDMA_MID,
- SRPT_RDMA_ABORT,
- SRPT_RDMA_READ_LAST,
- SRPT_RDMA_WRITE_LAST,
-};
-
-static inline u64 encode_wr_id(u8 opcode, u32 idx)
-{
- return ((u64)opcode << 32) | idx;
-}
-static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id)
-{
- return wr_id >> 32;
-}
-static inline u32 idx_from_wr_id(u64 wr_id)
-{
- return (u32)wr_id;
-}
-
-struct rdma_iu {
- u64 raddr;
- u32 rkey;
- struct ib_sge *sge;
- u32 sge_cnt;
- int mem_id;
-};
-
/**
* enum srpt_command_state - SCSI command state managed by SRPT.
* @SRPT_STATE_NEW: New command arrived and is being processed.
@@ -189,6 +159,7 @@ enum srpt_command_state {
* @index: Index of the I/O context in its ioctx_ring array.
*/
struct srpt_ioctx {
+ struct ib_cqe cqe;
void *buf;
dma_addr_t dma;
uint32_t index;
@@ -215,32 +186,30 @@ struct srpt_recv_ioctx {
* @sg: Pointer to sg-list associated with this I/O context.
* @sg_cnt: SG-list size.
* @mapped_sg_count: ib_dma_map_sg() return value.
- * @n_rdma_ius: Number of elements in the rdma_ius array.
- * @rdma_ius: Array with information about the RDMA mapping.
+ * @n_rdma_wrs: Number of elements in the rdma_wrs array.
+ * @rdma_wrs: Array with information about the RDMA mapping.
* @tag: Tag of the received SRP information unit.
* @spinlock: Protects 'state'.
* @state: I/O context state.
- * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether
- * the already initiated transfers have finished.
* @cmd: Target core command data structure.
* @sense_data: SCSI sense data.
*/
struct srpt_send_ioctx {
struct srpt_ioctx ioctx;
struct srpt_rdma_ch *ch;
- struct rdma_iu *rdma_ius;
+ struct ib_rdma_wr *rdma_wrs;
+ struct ib_cqe rdma_cqe;
struct srp_direct_buf *rbufs;
struct srp_direct_buf single_rbuf;
struct scatterlist *sg;
struct list_head free_list;
spinlock_t spinlock;
enum srpt_command_state state;
- bool rdma_aborted;
struct se_cmd cmd;
struct completion tx_done;
int sg_cnt;
int mapped_sg_count;
- u16 n_rdma_ius;
+ u16 n_rdma_wrs;
u8 n_rdma;
u8 n_rbuf;
bool queue_status_only;
@@ -267,9 +236,6 @@ enum rdma_ch_state {
/**
* struct srpt_rdma_ch - RDMA channel.
- * @wait_queue: Allows the kernel thread to wait for more work.
- * @thread: Kernel thread that processes the IB queues associated with
- * the channel.
* @cm_id: IB CM ID associated with the channel.
* @qp: IB queue pair used for communicating over this channel.
* @cq: IB completion queue for this channel.
@@ -288,7 +254,6 @@ enum rdma_ch_state {
* @free_list: Head of list with free send I/O contexts.
* @state: channel state. See also enum rdma_ch_state.
* @ioctx_ring: Send ring.
- * @wc: IB work completion array for srpt_process_completion().
* @list: Node for insertion in the srpt_device.rch_list list.
* @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
* list contains struct srpt_ioctx elements and is protected
@@ -299,8 +264,6 @@ enum rdma_ch_state {
* @release_done: Enables waiting for srpt_release_channel() completion.
*/
struct srpt_rdma_ch {
- wait_queue_head_t wait_queue;
- struct task_struct *thread;
struct ib_cm_id *cm_id;
struct ib_qp *qp;
struct ib_cq *cq;
@@ -317,7 +280,6 @@ struct srpt_rdma_ch {
struct list_head free_list;
enum rdma_ch_state state;
struct srpt_send_ioctx **ioctx_ring;
- struct ib_wc wc[16];
struct list_head list;
struct list_head cmd_wait_list;
struct se_session *sess;
@@ -377,8 +339,6 @@ struct srpt_port {
* @mr: L_Key (local key) with write access to all local memory.
* @srq: Per-HCA SRQ (shared receive queue).
* @cm_id: Connection identifier.
- * @dev_attr: Attributes of the InfiniBand device as obtained during the
- * ib_client.add() callback.
* @srq_size: SRQ size.
* @ioctx_ring: Per-HCA SRQ.
* @rch_list: Per-device channel list -- see also srpt_rdma_ch.list.
@@ -393,7 +353,6 @@ struct srpt_device {
struct ib_pd *pd;
struct ib_srq *srq;
struct ib_cm_id *cm_id;
- struct ib_device_attr dev_attr;
int srq_size;
struct srpt_recv_ioctx **ioctx_ring;
struct list_head rch_list;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 2c2baab..d66c690 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -157,6 +157,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
[29] = "802.1ad offload support",
[31] = "Modifying loopback source checks using UPDATE_QP support",
[32] = "Loopback source checks support",
+ [33] = "RoCEv2 support"
};
int i;
@@ -626,6 +627,8 @@ out:
return err;
}
+static void disable_unsupported_roce_caps(void *buf);
+
int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
{
struct mlx4_cmd_mailbox *mailbox;
@@ -738,6 +741,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
if (err)
goto out;
+ if (mlx4_is_mfunc(dev))
+ disable_unsupported_roce_caps(outbox);
MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
dev_cap->reserved_qps = 1 << (field & 0xf);
MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
@@ -905,6 +910,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
MLX4_GET(dev_cap->bmme_flags, outbox,
QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+ if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2)
+ dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2;
if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP)
dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP;
MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
@@ -1161,6 +1168,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
if (err)
return err;
+ disable_unsupported_roce_caps(outbox->buf);
/* add port mng change event capability and disable mw type 1
* unconditionally to slaves
*/
@@ -1258,6 +1266,21 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
return 0;
}
+static void disable_unsupported_roce_caps(void *buf)
+{
+ u32 flags;
+
+ MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+ flags &= ~(1UL << 31);
+ MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+ MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+ flags &= ~(1UL << 24);
+ MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+ MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+ flags &= ~(MLX4_FLAG_ROCE_V1_V2);
+ MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+}
+
int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
struct mlx4_vhcr *vhcr,
struct mlx4_cmd_mailbox *inbox,
@@ -2239,7 +2262,8 @@ struct mlx4_config_dev {
__be32 rsvd1[3];
__be16 vxlan_udp_dport;
__be16 rsvd2;
- __be32 rsvd3;
+ __be16 roce_v2_entropy;
+ __be16 roce_v2_udp_dport;
__be32 roce_flags;
__be32 rsvd4[25];
__be16 rsvd5;
@@ -2248,6 +2272,7 @@ struct mlx4_config_dev {
};
#define MLX4_VXLAN_UDP_DPORT (1 << 0)
+#define MLX4_ROCE_V2_UDP_DPORT BIT(3)
#define MLX4_DISABLE_RX_PORT BIT(18)
static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
@@ -2365,6 +2390,18 @@ int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis)
return mlx4_CONFIG_DEV_set(dev, &config_dev);
}
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port)
+{
+ struct mlx4_config_dev config_dev;
+
+ memset(&config_dev, 0, sizeof(config_dev));
+ config_dev.update_flags = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT);
+ config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port);
+
+ return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port);
+
int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2)
{
struct mlx4_cmd_mailbox *mailbox;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 2404c22..7baef52 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -780,7 +780,10 @@ struct mlx4_set_port_general_context {
u16 reserved1;
u8 v_ignore_fcs;
u8 flags;
- u8 ignore_fcs;
+ union {
+ u8 ignore_fcs;
+ u8 roce_mode;
+ };
u8 reserved2;
__be16 mtu;
u8 pptx;
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index f255042..787b7bb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -1520,6 +1520,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
return err;
}
+#define SET_PORT_ROCE_2_FLAGS 0x10
+#define MLX4_SET_PORT_ROCE_V1_V2 0x2
int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
{
@@ -1539,6 +1541,11 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
context->pprx = (pprx * (!pfcrx)) << 7;
context->pfcrx = pfcrx;
+ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+ context->flags |= SET_PORT_ROCE_2_FLAGS;
+ context->roce_mode |=
+ MLX4_SET_PORT_ROCE_V1_V2 << 4;
+ }
in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 168823d..d1cd9c3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -167,6 +167,12 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
}
+ if ((cur_state == MLX4_QP_STATE_RTR) &&
+ (new_state == MLX4_QP_STATE_RTS) &&
+ dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+ context->roce_entropy =
+ cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn));
+
*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
memcpy(mailbox->buf + 8, context, sizeof *context);
@@ -921,3 +927,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
return 0;
}
EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
+
+u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn)
+{
+ struct mlx4_qp_context context;
+ struct mlx4_qp qp;
+ int err;
+
+ qp.qpn = qpn;
+ err = mlx4_qp_query(dev, &qp, &context);
+ if (!err) {
+ u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ u16 folded_dst = folded_qp(dest_qpn);
+ u16 folded_src = folded_qp(qpn);
+
+ return (dest_qpn != qpn) ?
+ ((folded_dst ^ folded_src) | 0xC000) :
+ folded_src | 0xC000;
+ }
+ return 0xdead;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9ea49a8..aac071a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -39,8 +39,8 @@
#include <linux/mlx5/qp.h>
#include <linux/mlx5/cq.h>
#include <linux/mlx5/vport.h>
+#include <linux/mlx5/transobj.h>
#include "wq.h"
-#include "transobj.h"
#include "mlx5_core.h"
#define MLX5E_MAX_NUM_TC 8
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c56d91a..6a3e430 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2241,7 +2241,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
goto err_unmap_free_uar;
}
- err = mlx5_alloc_transport_domain(mdev, &priv->tdn);
+ err = mlx5_core_alloc_transport_domain(mdev, &priv->tdn);
if (err) {
mlx5_core_err(mdev, "alloc td failed, %d\n", err);
goto err_dealloc_pd;
@@ -2324,7 +2324,7 @@ err_destroy_mkey:
mlx5_core_destroy_mkey(mdev, &priv->mr);
err_dealloc_transport_domain:
- mlx5_dealloc_transport_domain(mdev, priv->tdn);
+ mlx5_core_dealloc_transport_domain(mdev, priv->tdn);
err_dealloc_pd:
mlx5_core_dealloc_pd(mdev, priv->pdn);
@@ -2356,7 +2356,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
mlx5e_close_drop_rq(priv);
mlx5e_destroy_tises(priv);
mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
- mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
+ mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn);
mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
free_netdev(netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 23c244a..647a3ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -230,6 +230,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+ rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n",
eqe_type_str(eqe->type), eqe->type, rsn);
mlx5_rsc_event(dev, rsn, eqe->type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index b37749a..1545a94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -78,6 +78,11 @@ struct mlx5_device_context {
void *context;
};
+enum {
+ MLX5_ATOMIC_REQ_MODE_BE = 0x0,
+ MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1,
+};
+
static struct mlx5_profile profile[] = {
[0] = {
.mask = 0,
@@ -387,7 +392,7 @@ query_ex:
return err;
}
-static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
+static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod)
{
u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)];
int err;
@@ -395,6 +400,7 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
memset(out, 0, sizeof(out));
MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+ MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1);
err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
if (err)
return err;
@@ -404,6 +410,46 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
return err;
}
+static int handle_hca_cap_atomic(struct mlx5_core_dev *dev)
+{
+ void *set_ctx;
+ void *set_hca_cap;
+ int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+ int req_endianness;
+ int err;
+
+ if (MLX5_CAP_GEN(dev, atomic)) {
+ err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC,
+ HCA_CAP_OPMOD_GET_CUR);
+ if (err)
+ return err;
+ } else {
+ return 0;
+ }
+
+ req_endianness =
+ MLX5_CAP_ATOMIC(dev,
+ supported_atomic_req_8B_endianess_mode_1);
+
+ if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS)
+ return 0;
+
+ set_ctx = kzalloc(set_sz, GFP_KERNEL);
+ if (!set_ctx)
+ return -ENOMEM;
+
+ set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
+
+ /* Set requestor to host endianness */
+ MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianess_mode,
+ MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS);
+
+ err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC);
+
+ kfree(set_ctx);
+ return err;
+}
+
static int handle_hca_cap(struct mlx5_core_dev *dev)
{
void *set_ctx = NULL;
@@ -445,7 +491,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
- err = set_caps(dev, set_ctx, set_sz);
+ err = set_caps(dev, set_ctx, set_sz,
+ MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
query_ex:
kfree(set_ctx);
@@ -667,7 +714,6 @@ clean:
return err;
}
-#ifdef CONFIG_MLX5_CORE_EN
static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
{
u32 query_in[MLX5_ST_SZ_DW(query_issi_in)];
@@ -720,7 +766,6 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
return -ENOTSUPP;
}
-#endif
static int map_bf_area(struct mlx5_core_dev *dev)
{
@@ -966,13 +1011,11 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
goto err_pagealloc_cleanup;
}
-#ifdef CONFIG_MLX5_CORE_EN
err = mlx5_core_set_issi(dev);
if (err) {
dev_err(&pdev->dev, "failed to set issi\n");
goto err_disable_hca;
}
-#endif
err = mlx5_satisfy_startup_pages(dev, 1);
if (err) {
@@ -992,6 +1035,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
goto reclaim_boot_pages;
}
+ err = handle_hca_cap_atomic(dev);
+ if (err) {
+ dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n");
+ goto reclaim_boot_pages;
+ }
+
err = mlx5_satisfy_startup_pages(dev, 0);
if (err) {
dev_err(&pdev->dev, "failed to allocate init pages\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 30e2ba3..def2893 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -36,6 +36,7 @@
#include <linux/mlx5/cmd.h>
#include <linux/mlx5/qp.h>
#include <linux/mlx5/driver.h>
+#include <linux/mlx5/transobj.h>
#include "mlx5_core.h"
@@ -67,6 +68,52 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common)
complete(&common->free);
}
+static u64 qp_allowed_event_types(void)
+{
+ u64 mask;
+
+ mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) |
+ BIT(MLX5_EVENT_TYPE_COMM_EST) |
+ BIT(MLX5_EVENT_TYPE_SQ_DRAINED) |
+ BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+ BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) |
+ BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) |
+ BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) |
+ BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR);
+
+ return mask;
+}
+
+static u64 rq_allowed_event_types(void)
+{
+ u64 mask;
+
+ mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+ BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+
+ return mask;
+}
+
+static u64 sq_allowed_event_types(void)
+{
+ return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+}
+
+static bool is_event_type_allowed(int rsc_type, int event_type)
+{
+ switch (rsc_type) {
+ case MLX5_EVENT_QUEUE_TYPE_QP:
+ return BIT(event_type) & qp_allowed_event_types();
+ case MLX5_EVENT_QUEUE_TYPE_RQ:
+ return BIT(event_type) & rq_allowed_event_types();
+ case MLX5_EVENT_QUEUE_TYPE_SQ:
+ return BIT(event_type) & sq_allowed_event_types();
+ default:
+ WARN(1, "Event arrived for unknown resource type");
+ return false;
+ }
+}
+
void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
{
struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn);
@@ -75,8 +122,16 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
if (!common)
return;
+ if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) {
+ mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n",
+ event_type, rsn);
+ return;
+ }
+
switch (common->res) {
case MLX5_RES_QP:
+ case MLX5_RES_RQ:
+ case MLX5_RES_SQ:
qp = (struct mlx5_core_qp *)common;
qp->event(qp, event_type);
break;
@@ -177,27 +232,56 @@ void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
}
#endif
+static int create_qprqsq_common(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *qp,
+ int rsc_type)
+{
+ struct mlx5_qp_table *table = &dev->priv.qp_table;
+ int err;
+
+ qp->common.res = rsc_type;
+ spin_lock_irq(&table->lock);
+ err = radix_tree_insert(&table->tree,
+ qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN),
+ qp);
+ spin_unlock_irq(&table->lock);
+ if (err)
+ return err;
+
+ atomic_set(&qp->common.refcount, 1);
+ init_completion(&qp->common.free);
+ qp->pid = current->pid;
+
+ return 0;
+}
+
+static void destroy_qprqsq_common(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *qp)
+{
+ struct mlx5_qp_table *table = &dev->priv.qp_table;
+ unsigned long flags;
+
+ spin_lock_irqsave(&table->lock, flags);
+ radix_tree_delete(&table->tree,
+ qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
+ spin_unlock_irqrestore(&table->lock, flags);
+ mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
+ wait_for_completion(&qp->common.free);
+}
+
int mlx5_core_create_qp(struct mlx5_core_dev *dev,
struct mlx5_core_qp *qp,
struct mlx5_create_qp_mbox_in *in,
int inlen)
{
- struct mlx5_qp_table *table = &dev->priv.qp_table;
struct mlx5_create_qp_mbox_out out;
struct mlx5_destroy_qp_mbox_in din;
struct mlx5_destroy_qp_mbox_out dout;
int err;
- void *qpc;
memset(&out, 0, sizeof(out));
in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP);
- if (dev->issi) {
- qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
- /* 0xffffff means we ask to work with cqe version 0 */
- MLX5_SET(qpc, qpc, user_index, 0xffffff);
- }
-
err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
if (err) {
mlx5_core_warn(dev, "ret %d\n", err);
@@ -213,24 +297,16 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
qp->qpn = be32_to_cpu(out.qpn) & 0xffffff;
mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
- qp->common.res = MLX5_RES_QP;
- spin_lock_irq(&table->lock);
- err = radix_tree_insert(&table->tree, qp->qpn, qp);
- spin_unlock_irq(&table->lock);
- if (err) {
- mlx5_core_warn(dev, "err %d\n", err);
+ err = create_qprqsq_common(dev, qp, MLX5_RES_QP);
+ if (err)
goto err_cmd;
- }
err = mlx5_debug_qp_add(dev, qp);
if (err)
mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n",
qp->qpn);
- qp->pid = current->pid;
- atomic_set(&qp->common.refcount, 1);
atomic_inc(&dev->num_qps);
- init_completion(&qp->common.free);
return 0;
@@ -250,18 +326,11 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
{
struct mlx5_destroy_qp_mbox_in in;
struct mlx5_destroy_qp_mbox_out out;
- struct mlx5_qp_table *table = &dev->priv.qp_table;
- unsigned long flags;
int err;
mlx5_debug_qp_remove(dev, qp);
- spin_lock_irqsave(&table->lock, flags);
- radix_tree_delete(&table->tree, qp->qpn);
- spin_unlock_irqrestore(&table->lock, flags);
-
- mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
- wait_for_completion(&qp->common.free);
+ destroy_qprqsq_common(dev, qp);
memset(&in, 0, sizeof(in));
memset(&out, 0, sizeof(out));
@@ -279,59 +348,15 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
}
EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp);
-int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
- enum mlx5_qp_state new_state,
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation,
struct mlx5_modify_qp_mbox_in *in, int sqd_event,
struct mlx5_core_qp *qp)
{
- static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
- [MLX5_QP_STATE_RST] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP,
- },
- [MLX5_QP_STATE_INIT] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP,
- [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP,
- },
- [MLX5_QP_STATE_RTR] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP,
- },
- [MLX5_QP_STATE_RTS] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP,
- },
- [MLX5_QP_STATE_SQD] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- },
- [MLX5_QP_STATE_SQER] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP,
- },
- [MLX5_QP_STATE_ERR] = {
- [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP,
- [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP,
- }
- };
-
struct mlx5_modify_qp_mbox_out out;
int err = 0;
- u16 op;
-
- if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE ||
- !optab[cur_state][new_state])
- return -EINVAL;
memset(&out, 0, sizeof(out));
- op = optab[cur_state][new_state];
- in->hdr.opcode = cpu_to_be16(op);
+ in->hdr.opcode = cpu_to_be16(operation);
in->qpn = cpu_to_be32(qp->qpn);
err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out));
if (err)
@@ -449,3 +474,67 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
}
EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
#endif
+
+int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+ struct mlx5_core_qp *rq)
+{
+ int err;
+ u32 rqn;
+
+ err = mlx5_core_create_rq(dev, in, inlen, &rqn);
+ if (err)
+ return err;
+
+ rq->qpn = rqn;
+ err = create_qprqsq_common(dev, rq, MLX5_RES_RQ);
+ if (err)
+ goto err_destroy_rq;
+
+ return 0;
+
+err_destroy_rq:
+ mlx5_core_destroy_rq(dev, rq->qpn);
+
+ return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_rq_tracked);
+
+void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *rq)
+{
+ destroy_qprqsq_common(dev, rq);
+ mlx5_core_destroy_rq(dev, rq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked);
+
+int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+ struct mlx5_core_qp *sq)
+{
+ int err;
+ u32 sqn;
+
+ err = mlx5_core_create_sq(dev, in, inlen, &sqn);
+ if (err)
+ return err;
+
+ sq->qpn = sqn;
+ err = create_qprqsq_common(dev, sq, MLX5_RES_SQ);
+ if (err)
+ goto err_destroy_sq;
+
+ return 0;
+
+err_destroy_sq:
+ mlx5_core_destroy_sq(dev, sq->qpn);
+
+ return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_sq_tracked);
+
+void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *sq)
+{
+ destroy_qprqsq_common(dev, sq);
+ mlx5_core_destroy_sq(dev, sq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
index ffada80..04bc522 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -37,7 +37,7 @@
#include <linux/mlx5/srq.h>
#include <rdma/ib_verbs.h>
#include "mlx5_core.h"
-#include "transobj.h"
+#include <linux/mlx5/transobj.h>
void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
{
@@ -241,8 +241,6 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc));
memcpy(pas, in->pas, pas_size);
- /* 0xffffff means we ask to work with cqe version 0 */
- MLX5_SET(xrc_srqc, xrc_srqc, user_index, 0xffffff);
MLX5_SET(create_xrc_srq_in, create_in, opcode,
MLX5_CMD_OP_CREATE_XRC_SRQ);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index d7068f5..03a5093 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -32,9 +32,9 @@
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
-#include "transobj.h"
+#include <linux/mlx5/transobj.h>
-int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
+int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
{
u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)];
u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)];
@@ -53,8 +53,9 @@ int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
return err;
}
+EXPORT_SYMBOL(mlx5_core_alloc_transport_domain);
-void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
+void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
{
u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)];
u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)];
@@ -68,6 +69,7 @@ void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
}
+EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain);
int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
{
@@ -94,6 +96,7 @@ int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
memset(out, 0, sizeof(out));
return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
}
+EXPORT_SYMBOL(mlx5_core_modify_rq);
void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
{
@@ -108,6 +111,18 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
}
+int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out)
+{
+ u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0};
+ int outlen = MLX5_ST_SZ_BYTES(query_rq_out);
+
+ MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ);
+ MLX5_SET(query_rq_in, in, rqn, rqn);
+
+ return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_rq);
+
int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn)
{
u32 out[MLX5_ST_SZ_DW(create_sq_out)];
@@ -133,6 +148,7 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen)
memset(out, 0, sizeof(out));
return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
}
+EXPORT_SYMBOL(mlx5_core_modify_sq);
void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
{
@@ -147,6 +163,18 @@ void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
}
+int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out)
+{
+ u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0};
+ int outlen = MLX5_ST_SZ_BYTES(query_sq_out);
+
+ MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ);
+ MLX5_SET(query_sq_in, in, sqn, sqn);
+
+ return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_sq);
+
int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *tirn)
{
@@ -162,6 +190,7 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
return err;
}
+EXPORT_SYMBOL(mlx5_core_create_tir);
int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
int inlen)
@@ -187,6 +216,7 @@ void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
}
+EXPORT_SYMBOL(mlx5_core_destroy_tir);
int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *tisn)
@@ -203,6 +233,19 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
return err;
}
+EXPORT_SYMBOL(mlx5_core_create_tis);
+
+int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in,
+ int inlen)
+{
+ u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0};
+
+ MLX5_SET(modify_tis_in, in, tisn, tisn);
+ MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS);
+
+ return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_tis);
void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
{
@@ -216,6 +259,7 @@ void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
}
+EXPORT_SYMBOL(mlx5_core_destroy_tis);
int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *rmpn)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 076197e..c7398b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -76,7 +76,7 @@ u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
return MLX5_GET(query_vport_state_out, out, admin_state);
}
-EXPORT_SYMBOL(mlx5_query_vport_admin_state);
+EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state);
int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 state)
@@ -104,7 +104,7 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
return err;
}
-EXPORT_SYMBOL(mlx5_modify_vport_admin_state);
+EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state);
static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
u32 *out, int outlen)
@@ -151,12 +151,9 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
nic_vport_context.permanent_address);
err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
- if (err)
- goto out;
-
- ether_addr_copy(addr, &out_addr[2]);
+ if (!err)
+ ether_addr_copy(addr, &out_addr[2]);
-out:
kvfree(out);
return err;
}
@@ -197,7 +194,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
return err;
}
-EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address);
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address);
int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
u32 vport,
@@ -430,6 +427,68 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
}
EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans);
+int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
+ u64 *system_image_guid)
+{
+ u32 *out;
+ int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+ out = mlx5_vzalloc(outlen);
+ if (!out)
+ return -ENOMEM;
+
+ mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+ *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out,
+ nic_vport_context.system_image_guid);
+
+ kfree(out);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
+
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
+{
+ u32 *out;
+ int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+ out = mlx5_vzalloc(outlen);
+ if (!out)
+ return -ENOMEM;
+
+ mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+ *node_guid = MLX5_GET64(query_nic_vport_context_out, out,
+ nic_vport_context.node_guid);
+
+ kfree(out);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid);
+
+int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
+ u16 *qkey_viol_cntr)
+{
+ u32 *out;
+ int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+ out = mlx5_vzalloc(outlen);
+ if (!out)
+ return -ENOMEM;
+
+ mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+ *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out,
+ nic_vport_context.qkey_violation_counter);
+
+ kfree(out);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr);
+
int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
u8 port_num, u16 vf_num, u16 gid_index,
union ib_gid *gid)
@@ -750,3 +809,44 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev,
return err;
}
EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc);
+
+enum mlx5_vport_roce_state {
+ MLX5_VPORT_ROCE_DISABLED = 0,
+ MLX5_VPORT_ROCE_ENABLED = 1,
+};
+
+static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev,
+ enum mlx5_vport_roce_state state)
+{
+ void *in;
+ int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+ int err;
+
+ in = mlx5_vzalloc(inlen);
+ if (!in) {
+ mlx5_core_warn(mdev, "failed to allocate inbox\n");
+ return -ENOMEM;
+ }
+
+ MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1);
+ MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en,
+ state);
+
+ err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+ kvfree(in);
+
+ return err;
+}
+
+int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev)
+{
+ return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED);
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce);
+
+int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev)
+{
+ return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED);
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce);
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 23d862d..e2f31c9 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1106,6 +1106,7 @@ config SCSI_IPR
tristate "IBM Power Linux RAID adapter support"
depends on PCI && SCSI && ATA
select FW_LOADER
+ select IRQ_POLL
---help---
This driver supports the IBM Power Linux family RAID adapters.
This includes IBM pSeries 5712, 5703, 5709, and 570A, as well
diff --git a/drivers/scsi/be2iscsi/Kconfig b/drivers/scsi/be2iscsi/Kconfig
index 4e7cad2..bad5f32 100644
--- a/drivers/scsi/be2iscsi/Kconfig
+++ b/drivers/scsi/be2iscsi/Kconfig
@@ -3,6 +3,7 @@ config BE2ISCSI
depends on PCI && SCSI && NET
select SCSI_ISCSI_ATTRS
select ISCSI_BOOT_SYSFS
+ select IRQ_POLL
help
This driver implements the iSCSI functionality for Emulex
diff --git a/drivers/scsi/be2iscsi/be.h b/drivers/scsi/be2iscsi/be.h
index 77f992e..a41c643 100644
--- a/drivers/scsi/be2iscsi/be.h
+++ b/drivers/scsi/be2iscsi/be.h
@@ -20,7 +20,7 @@
#include <linux/pci.h>
#include <linux/if_vlan.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
#define FW_VER_LEN 32
#define MCC_Q_LEN 128
#define MCC_CQ_LEN 256
@@ -101,7 +101,7 @@ struct be_eq_obj {
struct beiscsi_hba *phba;
struct be_queue_info *cq;
struct work_struct work_cqs; /* Work Item */
- struct blk_iopoll iopoll;
+ struct irq_poll iopoll;
};
struct be_mcc_obj {
diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c
index b7087ba..022e87b 100644
--- a/drivers/scsi/be2iscsi/be_iscsi.c
+++ b/drivers/scsi/be2iscsi/be_iscsi.c
@@ -1292,9 +1292,9 @@ static void beiscsi_flush_cq(struct beiscsi_hba *phba)
for (i = 0; i < phba->num_cpus; i++) {
pbe_eq = &phwi_context->be_eq[i];
- blk_iopoll_disable(&pbe_eq->iopoll);
+ irq_poll_disable(&pbe_eq->iopoll);
beiscsi_process_cq(pbe_eq);
- blk_iopoll_enable(&pbe_eq->iopoll);
+ irq_poll_enable(&pbe_eq->iopoll);
}
}
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index fe0c514..cb9072a 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -910,8 +910,7 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id)
num_eq_processed = 0;
while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32]
& EQE_VALID_MASK) {
- if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
- blk_iopoll_sched(&pbe_eq->iopoll);
+ irq_poll_sched(&pbe_eq->iopoll);
AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
queue_tail_inc(eq);
@@ -972,8 +971,7 @@ static irqreturn_t be_isr(int irq, void *dev_id)
spin_unlock_irqrestore(&phba->isr_lock, flags);
num_mcceq_processed++;
} else {
- if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
- blk_iopoll_sched(&pbe_eq->iopoll);
+ irq_poll_sched(&pbe_eq->iopoll);
num_ioeq_processed++;
}
AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
@@ -2295,7 +2293,7 @@ void beiscsi_process_all_cqs(struct work_struct *work)
hwi_ring_eq_db(phba, pbe_eq->q.id, 0, 0, 1, 1);
}
-static int be_iopoll(struct blk_iopoll *iop, int budget)
+static int be_iopoll(struct irq_poll *iop, int budget)
{
unsigned int ret;
struct beiscsi_hba *phba;
@@ -2306,7 +2304,7 @@ static int be_iopoll(struct blk_iopoll *iop, int budget)
pbe_eq->cq_count += ret;
if (ret < budget) {
phba = pbe_eq->phba;
- blk_iopoll_complete(iop);
+ irq_poll_complete(iop);
beiscsi_log(phba, KERN_INFO,
BEISCSI_LOG_CONFIG | BEISCSI_LOG_IO,
"BM_%d : rearm pbe_eq->q.id =%d\n",
@@ -5293,7 +5291,7 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba,
for (i = 0; i < phba->num_cpus; i++) {
pbe_eq = &phwi_context->be_eq[i];
- blk_iopoll_disable(&pbe_eq->iopoll);
+ irq_poll_disable(&pbe_eq->iopoll);
}
if (unload_state == BEISCSI_CLEAN_UNLOAD) {
@@ -5579,9 +5577,8 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev)
for (i = 0; i < phba->num_cpus; i++) {
pbe_eq = &phwi_context->be_eq[i];
- blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+ irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
be_iopoll);
- blk_iopoll_enable(&pbe_eq->iopoll);
}
i = (phba->msix_enabled) ? i : 0;
@@ -5752,9 +5749,8 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev,
for (i = 0; i < phba->num_cpus; i++) {
pbe_eq = &phwi_context->be_eq[i];
- blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+ irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
be_iopoll);
- blk_iopoll_enable(&pbe_eq->iopoll);
}
i = (phba->msix_enabled) ? i : 0;
@@ -5795,7 +5791,7 @@ free_blkenbld:
destroy_workqueue(phba->wq);
for (i = 0; i < phba->num_cpus; i++) {
pbe_eq = &phwi_context->be_eq[i];
- blk_iopoll_disable(&pbe_eq->iopoll);
+ irq_poll_disable(&pbe_eq->iopoll);
}
free_twq:
beiscsi_clean_port(phba);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 1c3759b..3b3e099 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -3638,7 +3638,7 @@ static struct device_attribute ipr_ioa_reset_attr = {
.store = ipr_store_reset_adapter
};
-static int ipr_iopoll(struct blk_iopoll *iop, int budget);
+static int ipr_iopoll(struct irq_poll *iop, int budget);
/**
* ipr_show_iopoll_weight - Show ipr polling mode
* @dev: class device struct
@@ -3681,34 +3681,33 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev,
int i;
if (!ioa_cfg->sis64) {
- dev_info(&ioa_cfg->pdev->dev, "blk-iopoll not supported on this adapter\n");
+ dev_info(&ioa_cfg->pdev->dev, "irq_poll not supported on this adapter\n");
return -EINVAL;
}
if (kstrtoul(buf, 10, &user_iopoll_weight))
return -EINVAL;
if (user_iopoll_weight > 256) {
- dev_info(&ioa_cfg->pdev->dev, "Invalid blk-iopoll weight. It must be less than 256\n");
+ dev_info(&ioa_cfg->pdev->dev, "Invalid irq_poll weight. It must be less than 256\n");
return -EINVAL;
}
if (user_iopoll_weight == ioa_cfg->iopoll_weight) {
- dev_info(&ioa_cfg->pdev->dev, "Current blk-iopoll weight has the same weight\n");
+ dev_info(&ioa_cfg->pdev->dev, "Current irq_poll weight has the same weight\n");
return strlen(buf);
}
if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
for (i = 1; i < ioa_cfg->hrrq_num; i++)
- blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+ irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
}
spin_lock_irqsave(shost->host_lock, lock_flags);
ioa_cfg->iopoll_weight = user_iopoll_weight;
if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
for (i = 1; i < ioa_cfg->hrrq_num; i++) {
- blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+ irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
ioa_cfg->iopoll_weight, ipr_iopoll);
- blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
}
}
spin_unlock_irqrestore(shost->host_lock, lock_flags);
@@ -5568,7 +5567,7 @@ static int ipr_process_hrrq(struct ipr_hrr_queue *hrr_queue, int budget,
return num_hrrq;
}
-static int ipr_iopoll(struct blk_iopoll *iop, int budget)
+static int ipr_iopoll(struct irq_poll *iop, int budget)
{
struct ipr_ioa_cfg *ioa_cfg;
struct ipr_hrr_queue *hrrq;
@@ -5584,7 +5583,7 @@ static int ipr_iopoll(struct blk_iopoll *iop, int budget)
completed_ops = ipr_process_hrrq(hrrq, budget, &doneq);
if (completed_ops < budget)
- blk_iopoll_complete(iop);
+ irq_poll_complete(iop);
spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
list_for_each_entry_safe(ipr_cmd, temp, &doneq, queue) {
@@ -5692,8 +5691,7 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp)
if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) ==
hrrq->toggle_bit) {
- if (!blk_iopoll_sched_prep(&hrrq->iopoll))
- blk_iopoll_sched(&hrrq->iopoll);
+ irq_poll_sched(&hrrq->iopoll);
spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
return IRQ_HANDLED;
}
@@ -10404,9 +10402,8 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id)
if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
for (i = 1; i < ioa_cfg->hrrq_num; i++) {
- blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+ irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
ioa_cfg->iopoll_weight, ipr_iopoll);
- blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
}
}
@@ -10435,7 +10432,7 @@ static void ipr_shutdown(struct pci_dev *pdev)
if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
ioa_cfg->iopoll_weight = 0;
for (i = 1; i < ioa_cfg->hrrq_num; i++)
- blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+ irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
}
while (ioa_cfg->in_reset_reload) {
diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h
index a34c7a5..56c5706 100644
--- a/drivers/scsi/ipr.h
+++ b/drivers/scsi/ipr.h
@@ -32,7 +32,7 @@
#include <linux/libata.h>
#include <linux/list.h>
#include <linux/kref.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
@@ -517,7 +517,7 @@ struct ipr_hrr_queue {
u8 allow_cmds:1;
u8 removing_ioa:1;
- struct blk_iopoll iopoll;
+ struct irq_poll iopoll;
};
/* Command packet structure */
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
index 72af486..cb74ae7 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -2070,32 +2070,13 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
{
- struct ib_device_attr *attr;
- int rc;
-
/* It's safe to assume a HCA can handle a page size
* matching that of the native system */
hdev->ibh_page_shift = PAGE_SHIFT;
hdev->ibh_page_size = 1 << PAGE_SHIFT;
hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
- LIBCFS_ALLOC(attr, sizeof(*attr));
- if (attr == NULL) {
- CERROR("Out of memory\n");
- return -ENOMEM;
- }
-
- rc = ib_query_device(hdev->ibh_ibdev, attr);
- if (rc == 0)
- hdev->ibh_mr_size = attr->max_mr_size;
-
- LIBCFS_FREE(attr, sizeof(*attr));
-
- if (rc != 0) {
- CERROR("Failed to query IB device: %d\n", rc);
- return rc;
- }
-
+ hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
if (hdev->ibh_mr_size == ~0ULL) {
hdev->ibh_mr_shift = 64;
return 0;
diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
index 3ef881f..7ad0c08 100644
--- a/drivers/staging/rdma/amso1100/c2_cq.c
+++ b/drivers/staging/rdma/amso1100/c2_cq.c
@@ -173,9 +173,6 @@ static inline int c2_poll_one(struct c2_dev *c2dev,
case C2_WR_TYPE_RDMA_READ:
entry->opcode = IB_WC_RDMA_READ;
break;
- case C2_WR_TYPE_BIND_MW:
- entry->opcode = IB_WC_BIND_MW;
- break;
case C2_WR_TYPE_RECV:
entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
entry->opcode = IB_WC_RECV;
diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
index a092ac7..de8d10e 100644
--- a/drivers/staging/rdma/amso1100/c2_provider.c
+++ b/drivers/staging/rdma/amso1100/c2_provider.c
@@ -337,43 +337,21 @@ static inline u32 c2_convert_access(int acc)
C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
}
-static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 * iova_start)
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
{
struct c2_mr *mr;
u64 *page_list;
- u32 total_len;
- int err, i, j, k, page_shift, pbl_depth;
+ const u32 total_len = 0xffffffff; /* AMSO1100 limit */
+ int err, page_shift, pbl_depth, i;
+ u64 kva = 0;
- pbl_depth = 0;
- total_len = 0;
+ pr_debug("%s:%u\n", __func__, __LINE__);
- page_shift = PAGE_SHIFT;
/*
- * If there is only 1 buffer we assume this could
- * be a map of all phy mem...use a 32k page_shift.
+ * This is a map of all phy mem...use a 32k page_shift.
*/
- if (num_phys_buf == 1)
- page_shift += 3;
-
- for (i = 0; i < num_phys_buf; i++) {
-
- if (offset_in_page(buffer_list[i].addr)) {
- pr_debug("Unaligned Memory Buffer: 0x%x\n",
- (unsigned int) buffer_list[i].addr);
- return ERR_PTR(-EINVAL);
- }
-
- if (!buffer_list[i].size) {
- pr_debug("Invalid Buffer Size\n");
- return ERR_PTR(-EINVAL);
- }
-
- total_len += buffer_list[i].size;
- pbl_depth += ALIGN(buffer_list[i].size,
- BIT(page_shift)) >> page_shift;
- }
+ page_shift = PAGE_SHIFT + 3;
+ pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift;
page_list = vmalloc(sizeof(u64) * pbl_depth);
if (!page_list) {
@@ -382,16 +360,8 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
return ERR_PTR(-ENOMEM);
}
- for (i = 0, j = 0; i < num_phys_buf; i++) {
-
- int naddrs;
-
- naddrs = ALIGN(buffer_list[i].size,
- BIT(page_shift)) >> page_shift;
- for (k = 0; k < naddrs; k++)
- page_list[j++] = (buffer_list[i].addr +
- (k << page_shift));
- }
+ for (i = 0; i < pbl_depth; i++)
+ page_list[i] = (i << page_shift);
mr = kmalloc(sizeof(*mr), GFP_KERNEL);
if (!mr) {
@@ -399,17 +369,17 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
return ERR_PTR(-ENOMEM);
}
- mr->pd = to_c2pd(ib_pd);
+ mr->pd = to_c2pd(pd);
mr->umem = NULL;
pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
"*iova_start %llx, first pa %llx, last pa %llx\n",
__func__, page_shift, pbl_depth, total_len,
- (unsigned long long) *iova_start,
+ (unsigned long long) kva,
(unsigned long long) page_list[0],
(unsigned long long) page_list[pbl_depth-1]);
- err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+ err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), page_list,
BIT(page_shift), pbl_depth,
- total_len, 0, iova_start,
+ total_len, 0, &kva,
c2_convert_access(acc), mr);
vfree(page_list);
if (err) {
@@ -420,19 +390,6 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
return &mr->ibmr;
}
-static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
-{
- struct ib_phys_buf bl;
- u64 kva = 0;
-
- pr_debug("%s:%u\n", __func__, __LINE__);
-
- /* AMSO1100 limit */
- bl.size = 0xffffffff;
- bl.addr = 0;
- return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
-}
-
static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
@@ -840,7 +797,6 @@ int c2_register_device(struct c2_dev *dev)
dev->ibdev.destroy_cq = c2_destroy_cq;
dev->ibdev.poll_cq = c2_poll_cq;
dev->ibdev.get_dma_mr = c2_get_dma_mr;
- dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
dev->ibdev.reg_user_mr = c2_reg_user_mr;
dev->ibdev.dereg_mr = c2_dereg_mr;
dev->ibdev.get_port_immutable = c2_port_immutable;
diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h
index bd45e0f..e8c3387 100644
--- a/drivers/staging/rdma/ehca/ehca_classes.h
+++ b/drivers/staging/rdma/ehca/ehca_classes.h
@@ -316,9 +316,8 @@ struct ehca_mr_pginfo {
union {
struct { /* type EHCA_MR_PGI_PHYS section */
- int num_phys_buf;
- struct ib_phys_buf *phys_buf_array;
- u64 next_buf;
+ u64 addr;
+ u16 size;
} phy;
struct { /* type EHCA_MR_PGI_USER section */
struct ib_umem *region;
diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h
index 80e6a3d..cca5933 100644
--- a/drivers/staging/rdma/ehca/ehca_iverbs.h
+++ b/drivers/staging/rdma/ehca/ehca_iverbs.h
@@ -80,30 +80,14 @@ int ehca_destroy_ah(struct ib_ah *ah);
struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags, u64 *iova_start);
-
struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int mr_access_flags,
struct ib_udata *udata);
-int ehca_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf, int mr_access_flags, u64 *iova_start);
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
int ehca_dereg_mr(struct ib_mr *mr);
struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
-
int ehca_dealloc_mw(struct ib_mw *mw);
struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c
index 860b974..832f22f 100644
--- a/drivers/staging/rdma/ehca/ehca_main.c
+++ b/drivers/staging/rdma/ehca/ehca_main.c
@@ -511,13 +511,9 @@ static int ehca_init_device(struct ehca_shca *shca)
shca->ib_device.req_notify_cq = ehca_req_notify_cq;
/* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */
shca->ib_device.get_dma_mr = ehca_get_dma_mr;
- shca->ib_device.reg_phys_mr = ehca_reg_phys_mr;
shca->ib_device.reg_user_mr = ehca_reg_user_mr;
- shca->ib_device.query_mr = ehca_query_mr;
shca->ib_device.dereg_mr = ehca_dereg_mr;
- shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr;
shca->ib_device.alloc_mw = ehca_alloc_mw;
- shca->ib_device.bind_mw = ehca_bind_mw;
shca->ib_device.dealloc_mw = ehca_dealloc_mw;
shca->ib_device.alloc_fmr = ehca_alloc_fmr;
shca->ib_device.map_phys_fmr = ehca_map_phys_fmr;
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c
index 553e883..3367205 100644
--- a/drivers/staging/rdma/ehca/ehca_mrmw.c
+++ b/drivers/staging/rdma/ehca/ehca_mrmw.c
@@ -196,120 +196,6 @@ get_dma_mr_exit0:
/*----------------------------------------------------------------------*/
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_mr *ib_mr;
- int ret;
- struct ehca_mr *e_mr;
- struct ehca_shca *shca =
- container_of(pd->device, struct ehca_shca, ib_device);
- struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-
- u64 size;
-
- if ((num_phys_buf <= 0) || !phys_buf_array) {
- ehca_err(pd->device, "bad input values: num_phys_buf=%x "
- "phys_buf_array=%p", num_phys_buf, phys_buf_array);
- ib_mr = ERR_PTR(-EINVAL);
- goto reg_phys_mr_exit0;
- }
- if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
- !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
- ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
- !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
- /*
- * Remote Write Access requires Local Write Access
- * Remote Atomic Access requires Local Write Access
- */
- ehca_err(pd->device, "bad input values: mr_access_flags=%x",
- mr_access_flags);
- ib_mr = ERR_PTR(-EINVAL);
- goto reg_phys_mr_exit0;
- }
-
- /* check physical buffer list and calculate size */
- ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
- iova_start, &size);
- if (ret) {
- ib_mr = ERR_PTR(ret);
- goto reg_phys_mr_exit0;
- }
- if ((size == 0) ||
- (((u64)iova_start + size) < (u64)iova_start)) {
- ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
- size, iova_start);
- ib_mr = ERR_PTR(-EINVAL);
- goto reg_phys_mr_exit0;
- }
-
- e_mr = ehca_mr_new();
- if (!e_mr) {
- ehca_err(pd->device, "out of memory");
- ib_mr = ERR_PTR(-ENOMEM);
- goto reg_phys_mr_exit0;
- }
-
- /* register MR on HCA */
- if (ehca_mr_is_maxmr(size, iova_start)) {
- e_mr->flags |= EHCA_MR_FLAG_MAXMR;
- ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
- e_pd, &e_mr->ib.ib_mr.lkey,
- &e_mr->ib.ib_mr.rkey);
- if (ret) {
- ib_mr = ERR_PTR(ret);
- goto reg_phys_mr_exit1;
- }
- } else {
- struct ehca_mr_pginfo pginfo;
- u32 num_kpages;
- u32 num_hwpages;
- u64 hw_pgsize;
-
- num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
- PAGE_SIZE);
- /* for kernel space we try most possible pgsize */
- hw_pgsize = ehca_get_max_hwpage_size(shca);
- num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
- hw_pgsize);
- memset(&pginfo, 0, sizeof(pginfo));
- pginfo.type = EHCA_MR_PGI_PHYS;
- pginfo.num_kpages = num_kpages;
- pginfo.hwpage_size = hw_pgsize;
- pginfo.num_hwpages = num_hwpages;
- pginfo.u.phy.num_phys_buf = num_phys_buf;
- pginfo.u.phy.phys_buf_array = phys_buf_array;
- pginfo.next_hwpage =
- ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-
- ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
- e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
- &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
- if (ret) {
- ib_mr = ERR_PTR(ret);
- goto reg_phys_mr_exit1;
- }
- }
-
- /* successful registration of all pages */
- return &e_mr->ib.ib_mr;
-
-reg_phys_mr_exit1:
- ehca_mr_delete(e_mr);
-reg_phys_mr_exit0:
- if (IS_ERR(ib_mr))
- ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
- "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
- PTR_ERR(ib_mr), pd, phys_buf_array,
- num_phys_buf, mr_access_flags, iova_start);
- return ib_mr;
-} /* end ehca_reg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int mr_access_flags,
struct ib_udata *udata)
@@ -437,207 +323,6 @@ reg_user_mr_exit0:
/*----------------------------------------------------------------------*/
-int ehca_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- int ret;
-
- struct ehca_shca *shca =
- container_of(mr->device, struct ehca_shca, ib_device);
- struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
- u64 new_size;
- u64 *new_start;
- u32 new_acl;
- struct ehca_pd *new_pd;
- u32 tmp_lkey, tmp_rkey;
- unsigned long sl_flags;
- u32 num_kpages = 0;
- u32 num_hwpages = 0;
- struct ehca_mr_pginfo pginfo;
-
- if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
- /* TODO not supported, because PHYP rereg hCall needs pages */
- ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
- "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
-
- if (mr_rereg_mask & IB_MR_REREG_PD) {
- if (!pd) {
- ehca_err(mr->device, "rereg with bad pd, pd=%p "
- "mr_rereg_mask=%x", pd, mr_rereg_mask);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
- }
-
- if ((mr_rereg_mask &
- ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
- (mr_rereg_mask == 0)) {
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
-
- /* check other parameters */
- if (e_mr == shca->maxmr) {
- /* should be impossible, however reject to be sure */
- ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
- "shca->maxmr=%p mr->lkey=%x",
- mr, shca->maxmr, mr->lkey);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
- if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
- if (e_mr->flags & EHCA_MR_FLAG_FMR) {
- ehca_err(mr->device, "not supported for FMR, mr=%p "
- "flags=%x", mr, e_mr->flags);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
- if (!phys_buf_array || num_phys_buf <= 0) {
- ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
- " phys_buf_array=%p num_phys_buf=%x",
- mr_rereg_mask, phys_buf_array, num_phys_buf);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
- }
- if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */
- (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
- !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
- ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
- !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
- /*
- * Remote Write Access requires Local Write Access
- * Remote Atomic Access requires Local Write Access
- */
- ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
- "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
- ret = -EINVAL;
- goto rereg_phys_mr_exit0;
- }
-
- /* set requested values dependent on rereg request */
- spin_lock_irqsave(&e_mr->mrlock, sl_flags);
- new_start = e_mr->start;
- new_size = e_mr->size;
- new_acl = e_mr->acl;
- new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
-
- if (mr_rereg_mask & IB_MR_REREG_TRANS) {
- u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
-
- new_start = iova_start; /* change address */
- /* check physical buffer list and calculate size */
- ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
- num_phys_buf, iova_start,
- &new_size);
- if (ret)
- goto rereg_phys_mr_exit1;
- if ((new_size == 0) ||
- (((u64)iova_start + new_size) < (u64)iova_start)) {
- ehca_err(mr->device, "bad input values: new_size=%llx "
- "iova_start=%p", new_size, iova_start);
- ret = -EINVAL;
- goto rereg_phys_mr_exit1;
- }
- num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
- new_size, PAGE_SIZE);
- num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
- new_size, hw_pgsize);
- memset(&pginfo, 0, sizeof(pginfo));
- pginfo.type = EHCA_MR_PGI_PHYS;
- pginfo.num_kpages = num_kpages;
- pginfo.hwpage_size = hw_pgsize;
- pginfo.num_hwpages = num_hwpages;
- pginfo.u.phy.num_phys_buf = num_phys_buf;
- pginfo.u.phy.phys_buf_array = phys_buf_array;
- pginfo.next_hwpage =
- ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
- }
- if (mr_rereg_mask & IB_MR_REREG_ACCESS)
- new_acl = mr_access_flags;
- if (mr_rereg_mask & IB_MR_REREG_PD)
- new_pd = container_of(pd, struct ehca_pd, ib_pd);
-
- ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
- new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
- if (ret)
- goto rereg_phys_mr_exit1;
-
- /* successful reregistration */
- if (mr_rereg_mask & IB_MR_REREG_PD)
- mr->pd = pd;
- mr->lkey = tmp_lkey;
- mr->rkey = tmp_rkey;
-
-rereg_phys_mr_exit1:
- spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-rereg_phys_mr_exit0:
- if (ret)
- ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
- "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
- "iova_start=%p",
- ret, mr, mr_rereg_mask, pd, phys_buf_array,
- num_phys_buf, mr_access_flags, iova_start);
- return ret;
-} /* end ehca_rereg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
- int ret = 0;
- u64 h_ret;
- struct ehca_shca *shca =
- container_of(mr->device, struct ehca_shca, ib_device);
- struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
- unsigned long sl_flags;
- struct ehca_mr_hipzout_parms hipzout;
-
- if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
- ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
- "e_mr->flags=%x", mr, e_mr, e_mr->flags);
- ret = -EINVAL;
- goto query_mr_exit0;
- }
-
- memset(mr_attr, 0, sizeof(struct ib_mr_attr));
- spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-
- h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
- if (h_ret != H_SUCCESS) {
- ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
- "hca_hndl=%llx mr_hndl=%llx lkey=%x",
- h_ret, mr, shca->ipz_hca_handle.handle,
- e_mr->ipz_mr_handle.handle, mr->lkey);
- ret = ehca2ib_return_code(h_ret);
- goto query_mr_exit1;
- }
- mr_attr->pd = mr->pd;
- mr_attr->device_virt_addr = hipzout.vaddr;
- mr_attr->size = hipzout.len;
- mr_attr->lkey = hipzout.lkey;
- mr_attr->rkey = hipzout.rkey;
- ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
-
-query_mr_exit1:
- spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-query_mr_exit0:
- if (ret)
- ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
- ret, mr, mr_attr);
- return ret;
-} /* end ehca_query_mr() */
-
-/*----------------------------------------------------------------------*/
-
int ehca_dereg_mr(struct ib_mr *mr)
{
int ret = 0;
@@ -728,18 +413,6 @@ alloc_mw_exit0:
/*----------------------------------------------------------------------*/
-int ehca_bind_mw(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind)
-{
- /* TODO: not supported up to now */
- ehca_gen_err("bind MW currently not supported by HCAD");
-
- return -EPERM;
-} /* end ehca_bind_mw() */
-
-/*----------------------------------------------------------------------*/
-
int ehca_dealloc_mw(struct ib_mw *mw)
{
u64 h_ret;
@@ -1616,7 +1289,6 @@ int ehca_reg_internal_maxmr(
u64 *iova_start;
u64 size_maxmr;
struct ehca_mr_pginfo pginfo;
- struct ib_phys_buf ib_pbuf;
u32 num_kpages;
u32 num_hwpages;
u64 hw_pgsize;
@@ -1637,8 +1309,6 @@ int ehca_reg_internal_maxmr(
/* register internal max-MR on HCA */
size_maxmr = ehca_mr_len;
iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
- ib_pbuf.addr = 0;
- ib_pbuf.size = size_maxmr;
num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
PAGE_SIZE);
hw_pgsize = ehca_get_max_hwpage_size(shca);
@@ -1650,8 +1320,8 @@ int ehca_reg_internal_maxmr(
pginfo.num_kpages = num_kpages;
pginfo.num_hwpages = num_hwpages;
pginfo.hwpage_size = hw_pgsize;
- pginfo.u.phy.num_phys_buf = 1;
- pginfo.u.phy.phys_buf_array = &ib_pbuf;
+ pginfo.u.phy.addr = 0;
+ pginfo.u.phy.size = size_maxmr;
ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
&pginfo, &e_mr->ib.ib_mr.lkey,
@@ -1669,7 +1339,6 @@ int ehca_reg_internal_maxmr(
e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
e_mr->ib.ib_mr.uobject = NULL;
atomic_inc(&(e_pd->ib_pd.usecnt));
- atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
*e_maxmr = e_mr;
return 0;
@@ -1762,61 +1431,6 @@ ehca_dereg_internal_maxmr_exit0:
/*----------------------------------------------------------------------*/
-/*
- * check physical buffer array of MR verbs for validness and
- * calculates MR size
- */
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- u64 *iova_start,
- u64 *size)
-{
- struct ib_phys_buf *pbuf = phys_buf_array;
- u64 size_count = 0;
- u32 i;
-
- if (num_phys_buf == 0) {
- ehca_gen_err("bad phys buf array len, num_phys_buf=0");
- return -EINVAL;
- }
- /* check first buffer */
- if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
- ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
- "pbuf->addr=%llx pbuf->size=%llx",
- iova_start, pbuf->addr, pbuf->size);
- return -EINVAL;
- }
- if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
- (num_phys_buf > 1)) {
- ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
- "pbuf->size=%llx", pbuf->addr, pbuf->size);
- return -EINVAL;
- }
-
- for (i = 0; i < num_phys_buf; i++) {
- if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
- ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
- "pbuf->size=%llx",
- i, pbuf->addr, pbuf->size);
- return -EINVAL;
- }
- if (((i > 0) && /* not 1st */
- (i < (num_phys_buf - 1)) && /* not last */
- (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
- ehca_gen_err("bad size, i=%x pbuf->size=%llx",
- i, pbuf->size);
- return -EINVAL;
- }
- size_count += pbuf->size;
- pbuf++;
- }
-
- *size = size_count;
- return 0;
-} /* end ehca_mr_chk_buf_and_calc_size() */
-
-/*----------------------------------------------------------------------*/
-
/* check page list of map FMR verb for validness */
int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
u64 *page_list,
@@ -2002,57 +1616,54 @@ static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
u32 number, u64 *kpage)
{
int ret = 0;
- struct ib_phys_buf *pbuf;
+ u64 addr = pginfo->u.phy.addr;
+ u64 size = pginfo->u.phy.size;
u64 num_hw, offs_hw;
u32 i = 0;
- /* loop over desired phys_buf_array entries */
- while (i < number) {
- pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
- num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
- pbuf->size, pginfo->hwpage_size);
- offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
- pginfo->hwpage_size;
- while (pginfo->next_hwpage < offs_hw + num_hw) {
- /* sanity check */
- if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
- (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
- ehca_gen_err("kpage_cnt >= num_kpages, "
- "kpage_cnt=%llx num_kpages=%llx "
- "hwpage_cnt=%llx "
- "num_hwpages=%llx i=%x",
- pginfo->kpage_cnt,
- pginfo->num_kpages,
- pginfo->hwpage_cnt,
- pginfo->num_hwpages, i);
- return -EFAULT;
- }
- *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
- (pginfo->next_hwpage * pginfo->hwpage_size);
- if ( !(*kpage) && pbuf->addr ) {
- ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
- "next_hwpage=%llx", pbuf->addr,
- pbuf->size, pginfo->next_hwpage);
- return -EFAULT;
- }
- (pginfo->hwpage_cnt)++;
- (pginfo->next_hwpage)++;
- if (PAGE_SIZE >= pginfo->hwpage_size) {
- if (pginfo->next_hwpage %
- (PAGE_SIZE / pginfo->hwpage_size) == 0)
- (pginfo->kpage_cnt)++;
- } else
- pginfo->kpage_cnt += pginfo->hwpage_size /
- PAGE_SIZE;
- kpage++;
- i++;
- if (i >= number) break;
+ num_hw = NUM_CHUNKS((addr % pginfo->hwpage_size) + size,
+ pginfo->hwpage_size);
+ offs_hw = (addr & ~(pginfo->hwpage_size - 1)) / pginfo->hwpage_size;
+
+ while (pginfo->next_hwpage < offs_hw + num_hw) {
+ /* sanity check */
+ if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+ (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+ ehca_gen_err("kpage_cnt >= num_kpages, "
+ "kpage_cnt=%llx num_kpages=%llx "
+ "hwpage_cnt=%llx "
+ "num_hwpages=%llx i=%x",
+ pginfo->kpage_cnt,
+ pginfo->num_kpages,
+ pginfo->hwpage_cnt,
+ pginfo->num_hwpages, i);
+ return -EFAULT;
}
- if (pginfo->next_hwpage >= offs_hw + num_hw) {
- (pginfo->u.phy.next_buf)++;
- pginfo->next_hwpage = 0;
+ *kpage = (addr & ~(pginfo->hwpage_size - 1)) +
+ (pginfo->next_hwpage * pginfo->hwpage_size);
+ if ( !(*kpage) && addr ) {
+ ehca_gen_err("addr=%llx size=%llx "
+ "next_hwpage=%llx", addr,
+ size, pginfo->next_hwpage);
+ return -EFAULT;
}
+ (pginfo->hwpage_cnt)++;
+ (pginfo->next_hwpage)++;
+ if (PAGE_SIZE >= pginfo->hwpage_size) {
+ if (pginfo->next_hwpage %
+ (PAGE_SIZE / pginfo->hwpage_size) == 0)
+ (pginfo->kpage_cnt)++;
+ } else
+ pginfo->kpage_cnt += pginfo->hwpage_size /
+ PAGE_SIZE;
+ kpage++;
+ i++;
+ if (i >= number) break;
}
+ if (pginfo->next_hwpage >= offs_hw + num_hw) {
+ pginfo->next_hwpage = 0;
+ }
+
return ret;
}
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h
index 50d8b51..52bfa95 100644
--- a/drivers/staging/rdma/ehca/ehca_mrmw.h
+++ b/drivers/staging/rdma/ehca/ehca_mrmw.h
@@ -98,11 +98,6 @@ int ehca_reg_maxmr(struct ehca_shca *shca,
int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- u64 *iova_start,
- u64 *size);
-
int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
u64 *page_list,
int list_len);
diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c
index 10e2074..11813b8 100644
--- a/drivers/staging/rdma/ehca/ehca_reqs.c
+++ b/drivers/staging/rdma/ehca/ehca_reqs.c
@@ -614,7 +614,6 @@ int ehca_post_srq_recv(struct ib_srq *srq,
static const u8 ib_wc_opcode[255] = {
[0x01] = IB_WC_RECV+1,
[0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
- [0x04] = IB_WC_BIND_MW+1,
[0x08] = IB_WC_FETCH_ADD+1,
[0x10] = IB_WC_COMP_SWAP+1,
[0x20] = IB_WC_RDMA_WRITE+1,
diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
index 568f185..a3f8b88 100644
--- a/drivers/staging/rdma/hfi1/mr.c
+++ b/drivers/staging/rdma/hfi1/mr.c
@@ -167,10 +167,7 @@ static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
rval = init_mregion(&mr->mr, pd, count);
if (rval)
goto bail;
- /*
- * ib_reg_phys_mr() will initialize mr->ibmr except for
- * lkey and rkey.
- */
+
rval = hfi1_alloc_lkey(&mr->mr, 0);
if (rval)
goto bail_mregion;
@@ -188,52 +185,6 @@ bail:
}
/**
- * hfi1_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start)
-{
- struct hfi1_mr *mr;
- int n, m, i;
- struct ib_mr *ret;
-
- mr = alloc_mr(num_phys_buf, pd);
- if (IS_ERR(mr)) {
- ret = (struct ib_mr *)mr;
- goto bail;
- }
-
- mr->mr.user_base = *iova_start;
- mr->mr.iova = *iova_start;
- mr->mr.access_flags = acc;
-
- m = 0;
- n = 0;
- for (i = 0; i < num_phys_buf; i++) {
- mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
- mr->mr.map[m]->segs[n].length = buffer_list[i].size;
- mr->mr.length += buffer_list[i].size;
- n++;
- if (n == HFI1_SEGSZ) {
- m++;
- n = 0;
- }
- }
-
- ret = &mr->ibmr;
-
-bail:
- return ret;
-}
-
-/**
* hfi1_reg_user_mr - register a userspace memory region
* @pd: protection domain for this memory region
* @start: starting userspace address
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index ef0feaa..09b8d41 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -2052,7 +2052,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
ibdev->poll_cq = hfi1_poll_cq;
ibdev->req_notify_cq = hfi1_req_notify_cq;
ibdev->get_dma_mr = hfi1_get_dma_mr;
- ibdev->reg_phys_mr = hfi1_reg_phys_mr;
ibdev->reg_user_mr = hfi1_reg_user_mr;
ibdev->dereg_mr = hfi1_dereg_mr;
ibdev->alloc_mr = hfi1_alloc_mr;
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index 72106e5..286e468 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -1024,10 +1024,6 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
-struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start);
-
struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
index c7278f6..b76b0ce 100644
--- a/drivers/staging/rdma/ipath/ipath_mr.c
+++ b/drivers/staging/rdma/ipath/ipath_mr.c
@@ -98,10 +98,6 @@ static struct ipath_mr *alloc_mr(int count,
}
mr->mr.mapsz = m;
- /*
- * ib_reg_phys_mr() will initialize mr->ibmr except for
- * lkey and rkey.
- */
if (!ipath_alloc_lkey(lk_table, &mr->mr))
goto bail;
mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
@@ -121,57 +117,6 @@ done:
}
/**
- * ipath_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start)
-{
- struct ipath_mr *mr;
- int n, m, i;
- struct ib_mr *ret;
-
- mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
- if (mr == NULL) {
- ret = ERR_PTR(-ENOMEM);
- goto bail;
- }
-
- mr->mr.pd = pd;
- mr->mr.user_base = *iova_start;
- mr->mr.iova = *iova_start;
- mr->mr.length = 0;
- mr->mr.offset = 0;
- mr->mr.access_flags = acc;
- mr->mr.max_segs = num_phys_buf;
- mr->umem = NULL;
-
- m = 0;
- n = 0;
- for (i = 0; i < num_phys_buf; i++) {
- mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
- mr->mr.map[m]->segs[n].length = buffer_list[i].size;
- mr->mr.length += buffer_list[i].size;
- n++;
- if (n == IPATH_SEGSZ) {
- m++;
- n = 0;
- }
- }
-
- ret = &mr->ibmr;
-
-bail:
- return ret;
-}
-
-/**
* ipath_reg_user_mr - register a userspace memory region
* @pd: protection domain for this memory region
* @start: starting userspace address
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
index 1778dee..53f9dca 100644
--- a/drivers/staging/rdma/ipath/ipath_verbs.c
+++ b/drivers/staging/rdma/ipath/ipath_verbs.c
@@ -2201,7 +2201,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
dev->poll_cq = ipath_poll_cq;
dev->req_notify_cq = ipath_req_notify_cq;
dev->get_dma_mr = ipath_get_dma_mr;
- dev->reg_phys_mr = ipath_reg_phys_mr;
dev->reg_user_mr = ipath_reg_user_mr;
dev->dereg_mr = ipath_dereg_mr;
dev->alloc_fmr = ipath_alloc_fmr;
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
index 0a90a56..6c70a89 100644
--- a/drivers/staging/rdma/ipath/ipath_verbs.h
+++ b/drivers/staging/rdma/ipath/ipath_verbs.h
@@ -828,10 +828,6 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *buffer_list,
- int num_phys_buf, int acc, u64 *iova_start);
-
struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
deleted file mode 100644
index 77ae77c..0000000
--- a/include/linux/blk-iopoll.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef BLK_IOPOLL_H
-#define BLK_IOPOLL_H
-
-struct blk_iopoll;
-typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
-
-struct blk_iopoll {
- struct list_head list;
- unsigned long state;
- unsigned long data;
- int weight;
- int max;
- blk_iopoll_fn *poll;
-};
-
-enum {
- IOPOLL_F_SCHED = 0,
- IOPOLL_F_DISABLE = 1,
-};
-
-/*
- * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
- * that we were the first to acquire this iop for scheduling. If this iop
- * is currently disabled, return "failure".
- */
-static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
-{
- if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
- return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
-
- return 1;
-}
-
-static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
-{
- return test_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-
-extern void blk_iopoll_sched(struct blk_iopoll *);
-extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
-extern void blk_iopoll_complete(struct blk_iopoll *);
-extern void __blk_iopoll_complete(struct blk_iopoll *);
-extern void blk_iopoll_enable(struct blk_iopoll *);
-extern void blk_iopoll_disable(struct blk_iopoll *);
-
-#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index cb30edb..0e95fcc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -413,7 +413,7 @@ enum
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
- BLOCK_IOPOLL_SOFTIRQ,
+ IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h
new file mode 100644
index 0000000..3e8c1b8
--- /dev/null
+++ b/include/linux/irq_poll.h
@@ -0,0 +1,25 @@
+#ifndef IRQ_POLL_H
+#define IRQ_POLL_H
+
+struct irq_poll;
+typedef int (irq_poll_fn)(struct irq_poll *, int);
+
+struct irq_poll {
+ struct list_head list;
+ unsigned long state;
+ int weight;
+ irq_poll_fn *poll;
+};
+
+enum {
+ IRQ_POLL_F_SCHED = 0,
+ IRQ_POLL_F_DISABLE = 1,
+};
+
+extern void irq_poll_sched(struct irq_poll *);
+extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *);
+extern void irq_poll_complete(struct irq_poll *);
+extern void irq_poll_enable(struct irq_poll *);
+extern void irq_poll_disable(struct irq_poll *);
+
+#endif
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 58391f2..116b284 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -206,7 +206,8 @@ enum {
MLX4_SET_PORT_GID_TABLE = 0x5,
MLX4_SET_PORT_PRIO2TC = 0x8,
MLX4_SET_PORT_SCHEDULER = 0x9,
- MLX4_SET_PORT_VXLAN = 0xB
+ MLX4_SET_PORT_VXLAN = 0xB,
+ MLX4_SET_PORT_ROCE_ADDR = 0xD
};
enum {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index d3133be..430a929 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -216,6 +216,7 @@ enum {
MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30,
MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31,
MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32,
+ MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33,
};
enum {
@@ -267,12 +268,14 @@ enum {
MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9,
MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10,
MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11,
+ MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19,
MLX4_BMME_FLAG_PORT_REMAP = 1 << 24,
MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28,
};
enum {
- MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP
+ MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP,
+ MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2
};
enum mlx4_event {
@@ -979,14 +982,11 @@ struct mlx4_mad_ifc {
for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
if ((type) == (dev)->caps.port_mask[(port)])
-#define mlx4_foreach_non_ib_transport_port(port, dev) \
- for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
- if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
-
#define mlx4_foreach_ib_transport_port(port, dev) \
- for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
+ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
- ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+ ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \
+ ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2))
#define MLX4_INVALID_SLAVE_ID 0xFF
#define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1)
@@ -1457,6 +1457,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port);
int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port);
int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis);
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port);
int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2);
int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index fe052e2..587cdf9 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -194,7 +194,7 @@ struct mlx4_qp_context {
u8 mtu_msgmax;
u8 rq_size_stride;
u8 sq_size_stride;
- u8 rlkey;
+ u8 rlkey_roce_mode;
__be32 usr_page;
__be32 local_qpn;
__be32 remote_qpn;
@@ -204,7 +204,8 @@ struct mlx4_qp_context {
u32 reserved1;
__be32 next_send_psn;
__be32 cqn_send;
- u32 reserved2[2];
+ __be16 roce_entropy;
+ __be16 reserved2[3];
__be32 last_acked_psn;
__be32 ssn;
__be32 params2;
@@ -487,4 +488,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
+static inline u16 folded_qp(u32 q)
+{
+ u16 res;
+
+ res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00);
+ return res;
+}
+
+u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn);
+
#endif /* MLX4_QP_H */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 7be845e..987764a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -223,6 +223,14 @@ enum {
#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1)
#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
+#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8)
+
+enum {
+ MLX5_EVENT_QUEUE_TYPE_QP = 0,
+ MLX5_EVENT_QUEUE_TYPE_RQ = 1,
+ MLX5_EVENT_QUEUE_TYPE_SQ = 2,
+};
+
enum mlx5_event {
MLX5_EVENT_TYPE_COMP = 0x0,
@@ -280,6 +288,26 @@ enum {
};
enum {
+ MLX5_ROCE_VERSION_1 = 0,
+ MLX5_ROCE_VERSION_2 = 2,
+};
+
+enum {
+ MLX5_ROCE_VERSION_1_CAP = 1 << MLX5_ROCE_VERSION_1,
+ MLX5_ROCE_VERSION_2_CAP = 1 << MLX5_ROCE_VERSION_2,
+};
+
+enum {
+ MLX5_ROCE_L3_TYPE_IPV4 = 0,
+ MLX5_ROCE_L3_TYPE_IPV6 = 1,
+};
+
+enum {
+ MLX5_ROCE_L3_TYPE_IPV4_CAP = 1 << 1,
+ MLX5_ROCE_L3_TYPE_IPV6_CAP = 1 << 2,
+};
+
+enum {
MLX5_OPCODE_NOP = 0x00,
MLX5_OPCODE_SEND_INVAL = 0x01,
MLX5_OPCODE_RDMA_WRITE = 0x08,
@@ -446,7 +474,7 @@ struct mlx5_init_seg {
__be32 rsvd2[880];
__be32 internal_timer_h;
__be32 internal_timer_l;
- __be32 rsrv3[2];
+ __be32 rsvd3[2];
__be32 health_counter;
__be32 rsvd4[1019];
__be64 ieee1588_clk;
@@ -460,7 +488,9 @@ struct mlx5_eqe_comp {
};
struct mlx5_eqe_qp_srq {
- __be32 reserved[6];
+ __be32 reserved1[5];
+ u8 type;
+ u8 reserved2[3];
__be32 qp_srq_n;
};
@@ -651,6 +681,12 @@ enum {
};
enum {
+ MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH = 0x0,
+ MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6 = 0x1,
+ MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4 = 0x2,
+};
+
+enum {
CQE_L2_OK = 1 << 0,
CQE_L3_OK = 1 << 1,
CQE_L4_OK = 1 << 2,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5162f35..1e3006d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -115,6 +115,11 @@ enum {
MLX5_REG_HOST_ENDIANNESS = 0x7004,
};
+enum {
+ MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0,
+ MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1,
+};
+
enum mlx5_page_fault_resume_flags {
MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1,
@@ -341,9 +346,11 @@ struct mlx5_core_mr {
};
enum mlx5_res_type {
- MLX5_RES_QP,
- MLX5_RES_SRQ,
- MLX5_RES_XSRQ,
+ MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP,
+ MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ,
+ MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ,
+ MLX5_RES_SRQ = 3,
+ MLX5_RES_XSRQ = 4,
};
struct mlx5_core_rsc_common {
@@ -651,13 +658,6 @@ extern struct workqueue_struct *mlx5_core_wq;
.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
.struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field
-struct ib_field {
- size_t struct_offset_bytes;
- size_t struct_size_bytes;
- int offset_bits;
- int size_bits;
-};
-
static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev)
{
return pci_get_drvdata(pdev);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 68d73f8..231ab6b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -67,6 +67,11 @@ enum {
};
enum {
+ MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0,
+ MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3,
+};
+
+enum {
MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
MLX5_CMD_OP_QUERY_ADAPTER = 0x101,
MLX5_CMD_OP_INIT_HCA = 0x102,
@@ -573,21 +578,24 @@ enum {
struct mlx5_ifc_atomic_caps_bits {
u8 reserved_0[0x40];
- u8 atomic_req_endianness[0x1];
- u8 reserved_1[0x1f];
+ u8 atomic_req_8B_endianess_mode[0x2];
+ u8 reserved_1[0x4];
+ u8 supported_atomic_req_8B_endianess_mode_1[0x1];
- u8 reserved_2[0x20];
+ u8 reserved_2[0x19];
- u8 reserved_3[0x10];
- u8 atomic_operations[0x10];
+ u8 reserved_3[0x20];
u8 reserved_4[0x10];
- u8 atomic_size_qp[0x10];
+ u8 atomic_operations[0x10];
u8 reserved_5[0x10];
+ u8 atomic_size_qp[0x10];
+
+ u8 reserved_6[0x10];
u8 atomic_size_dc[0x10];
- u8 reserved_6[0x720];
+ u8 reserved_7[0x720];
};
struct mlx5_ifc_odp_cap_bits {
@@ -850,7 +858,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_66[0x8];
u8 log_uar_page_sz[0x10];
- u8 reserved_67[0x40];
+ u8 reserved_67[0x20];
+ u8 device_frequency_mhz[0x20];
u8 device_frequency_khz[0x20];
u8 reserved_68[0x5f];
u8 cqe_zip[0x1];
@@ -2215,19 +2224,25 @@ struct mlx5_ifc_nic_vport_context_bits {
u8 mtu[0x10];
- u8 reserved_3[0x640];
+ u8 system_image_guid[0x40];
+ u8 port_guid[0x40];
+ u8 node_guid[0x40];
+
+ u8 reserved_3[0x140];
+ u8 qkey_violation_counter[0x10];
+ u8 reserved_4[0x430];
u8 promisc_uc[0x1];
u8 promisc_mc[0x1];
u8 promisc_all[0x1];
- u8 reserved_4[0x2];
+ u8 reserved_5[0x2];
u8 allowed_list_type[0x3];
- u8 reserved_5[0xc];
+ u8 reserved_6[0xc];
u8 allowed_list_size[0xc];
struct mlx5_ifc_mac_address_layout_bits permanent_address;
- u8 reserved_6[0x20];
+ u8 reserved_7[0x20];
u8 current_uc_mac_address[0][0x40];
};
@@ -4199,6 +4214,13 @@ struct mlx5_ifc_modify_tis_out_bits {
u8 reserved_1[0x40];
};
+struct mlx5_ifc_modify_tis_bitmask_bits {
+ u8 reserved_0[0x20];
+
+ u8 reserved_1[0x1f];
+ u8 prio[0x1];
+};
+
struct mlx5_ifc_modify_tis_in_bits {
u8 opcode[0x10];
u8 reserved_0[0x10];
@@ -4211,7 +4233,7 @@ struct mlx5_ifc_modify_tis_in_bits {
u8 reserved_3[0x20];
- u8 modify_bitmask[0x40];
+ struct mlx5_ifc_modify_tis_bitmask_bits bitmask;
u8 reserved_4[0x40];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index f079fb1..5b8c89f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -85,7 +85,16 @@ enum mlx5_qp_state {
MLX5_QP_STATE_ERR = 6,
MLX5_QP_STATE_SQ_DRAINING = 7,
MLX5_QP_STATE_SUSPENDED = 9,
- MLX5_QP_NUM_STATE
+ MLX5_QP_NUM_STATE,
+ MLX5_QP_STATE,
+ MLX5_QP_STATE_BAD,
+};
+
+enum {
+ MLX5_SQ_STATE_NA = MLX5_SQC_STATE_ERR + 1,
+ MLX5_SQ_NUM_STATE = MLX5_SQ_STATE_NA + 1,
+ MLX5_RQ_STATE_NA = MLX5_RQC_STATE_ERR + 1,
+ MLX5_RQ_NUM_STATE = MLX5_RQ_STATE_NA + 1,
};
enum {
@@ -130,6 +139,9 @@ enum {
MLX5_QP_BIT_RWE = 1 << 14,
MLX5_QP_BIT_RAE = 1 << 13,
MLX5_QP_BIT_RIC = 1 << 4,
+ MLX5_QP_BIT_CC_SLAVE_RECV = 1 << 2,
+ MLX5_QP_BIT_CC_SLAVE_SEND = 1 << 1,
+ MLX5_QP_BIT_CC_MASTER = 1 << 0
};
enum {
@@ -248,8 +260,12 @@ struct mlx5_av {
__be32 dqp_dct;
u8 stat_rate_sl;
u8 fl_mlid;
- __be16 rlid;
- u8 reserved0[10];
+ union {
+ __be16 rlid;
+ __be16 udp_sport;
+ };
+ u8 reserved0[4];
+ u8 rmac[6];
u8 tclass;
u8 hop_limit;
__be32 grh_gid_fl;
@@ -456,11 +472,16 @@ struct mlx5_qp_path {
u8 static_rate;
u8 hop_limit;
__be32 tclass_flowlabel;
- u8 rgid[16];
- u8 rsvd1[4];
- u8 sl;
+ union {
+ u8 rgid[16];
+ u8 rip[16];
+ };
+ u8 f_dscp_ecn_prio;
+ u8 ecn_dscp;
+ __be16 udp_sport;
+ u8 dci_cfi_prio_sl;
u8 port;
- u8 rsvd2[6];
+ u8 rmac[6];
};
struct mlx5_qp_context {
@@ -620,8 +641,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
struct mlx5_core_qp *qp,
struct mlx5_create_qp_mbox_in *in,
int inlen);
-int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
- enum mlx5_qp_state new_state,
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation,
struct mlx5_modify_qp_mbox_in *in, int sqd_event,
struct mlx5_core_qp *qp);
int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
@@ -639,6 +659,14 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
u8 context, int error);
#endif
+int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+ struct mlx5_core_qp *rq);
+void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *rq);
+int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+ struct mlx5_core_qp *sq);
+void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
+ struct mlx5_core_qp *sq);
static inline const char *mlx5_qp_type_str(int type)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/include/linux/mlx5/transobj.h
index 74cae51..88441f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/include/linux/mlx5/transobj.h
@@ -33,16 +33,20 @@
#ifndef __TRANSOBJ_H__
#define __TRANSOBJ_H__
-int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn);
-void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn);
+#include <linux/mlx5/driver.h>
+
+int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn);
+void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn);
int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *rqn);
int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen);
void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn);
+int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out);
int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *sqn);
int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
+int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out);
int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *tirn);
int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
@@ -50,6 +54,8 @@ int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *tisn);
+int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in,
+ int inlen);
void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn);
int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
u32 *rmpn);
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 638f2ca..1237710 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -45,6 +45,11 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
u16 vport, u8 *addr);
int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
u16 vport, u8 *addr);
+int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
+ u64 *system_image_guid);
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
+int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
+ u16 *qkey_viol_cntr);
int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
u8 port_num, u16 vf_num, u16 gid_index,
union ib_gid *gid);
@@ -85,4 +90,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
u16 vlans[],
int list_size);
+int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev);
+int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev);
+
#endif /* __MLX5_VPORT_H__ */
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f869807..5322fea 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -51,6 +51,7 @@
/* RPC/RDMA parameters and stats */
extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
+extern unsigned int svcrdma_max_bc_requests;
extern unsigned int svcrdma_max_req_size;
extern atomic_t rdma_stat_recv;
@@ -69,6 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
* completes.
*/
struct svc_rdma_op_ctxt {
+ struct list_head free;
struct svc_rdma_op_ctxt *read_hdr;
struct svc_rdma_fastreg_mr *frmr;
int hdr_count;
@@ -112,6 +114,7 @@ struct svc_rdma_fastreg_mr {
struct list_head frmr_list;
};
struct svc_rdma_req_map {
+ struct list_head free;
unsigned long count;
union {
struct kvec sge[RPCSVC_MAXPAGES];
@@ -132,28 +135,32 @@ struct svcxprt_rdma {
int sc_max_sge;
int sc_max_sge_rd; /* max sge for read target */
- int sc_sq_depth; /* Depth of SQ */
atomic_t sc_sq_count; /* Number of SQ WR on queue */
-
- int sc_max_requests; /* Depth of RQ */
+ unsigned int sc_sq_depth; /* Depth of SQ */
+ unsigned int sc_rq_depth; /* Depth of RQ */
+ u32 sc_max_requests; /* Forward credits */
+ u32 sc_max_bc_requests;/* Backward credits */
int sc_max_req_size; /* Size of each RQ WR buf */
struct ib_pd *sc_pd;
atomic_t sc_dma_used;
- atomic_t sc_ctxt_used;
+ spinlock_t sc_ctxt_lock;
+ struct list_head sc_ctxts;
+ int sc_ctxt_used;
+ spinlock_t sc_map_lock;
+ struct list_head sc_maps;
+
struct list_head sc_rq_dto_q;
spinlock_t sc_rq_dto_lock;
struct ib_qp *sc_qp;
struct ib_cq *sc_rq_cq;
struct ib_cq *sc_sq_cq;
- struct ib_mr *sc_phys_mr; /* MR for server memory */
int (*sc_reader)(struct svcxprt_rdma *,
struct svc_rqst *,
struct svc_rdma_op_ctxt *,
int *, u32 *, u32, u32, u64, bool);
u32 sc_dev_caps; /* distilled device caps */
- u32 sc_dma_lkey; /* local dma key */
unsigned int sc_frmr_pg_list_len;
struct list_head sc_frmr_q;
spinlock_t sc_frmr_q_lock;
@@ -179,8 +186,18 @@ struct svcxprt_rdma {
#define RPCRDMA_MAX_REQUESTS 32
#define RPCRDMA_MAX_REQ_SIZE 4096
+/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
+ * current NFSv4.1 implementation supports one backchannel slot.
+ */
+#define RPCRDMA_MAX_BC_REQUESTS 2
+
#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
+/* svc_rdma_backchannel.c */
+extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
+ struct rpcrdma_msg *rmsgp,
+ struct xdr_buf *rcvbuf);
+
/* svc_rdma_marshal.c */
extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
@@ -206,6 +223,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
u32, u32, u64, bool);
/* svc_rdma_sendto.c */
+extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *,
+ struct svc_rdma_req_map *);
extern int svc_rdma_sendto(struct svc_rqst *);
extern struct rpcrdma_read_chunk *
svc_rdma_get_read_chunk(struct rpcrdma_msg *);
@@ -214,13 +233,14 @@ extern struct rpcrdma_read_chunk *
extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
enum rpcrdma_errcode);
-extern int svc_rdma_post_recv(struct svcxprt_rdma *);
+extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t);
extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
-extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
-extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
+extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *);
+extern void svc_rdma_put_req_map(struct svcxprt_rdma *,
+ struct svc_rdma_req_map *);
extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
struct svc_rdma_fastreg_mr *);
@@ -234,6 +254,7 @@ extern struct svc_xprt_class svc_rdma_bc_class;
#endif
/* svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
extern int svc_rdma_init(void);
extern void svc_rdma_cleanup(void);
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 1152859..c34c900 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -83,6 +83,8 @@ struct rdma_dev_addr {
int bound_dev_if;
enum rdma_transport_type transport;
struct net *net;
+ enum rdma_network_type network;
+ int hoplimit;
};
/**
@@ -91,8 +93,8 @@ struct rdma_dev_addr {
*
* The dev_addr->net field must be initialized.
*/
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
- u16 *vlan_id);
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr, u16 *vlan_id);
/**
* rdma_resolve_ip - Resolve source and destination IP addresses to
@@ -117,6 +119,10 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
struct rdma_dev_addr *addr, void *context),
void *context);
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr);
+
void rdma_addr_cancel(struct rdma_dev_addr *addr);
int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
@@ -125,8 +131,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
int rdma_addr_size(struct sockaddr *addr);
int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
-int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid,
- u8 *smac, u16 *vlan_id, int if_index);
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *smac, u16 *vlan_id, int *if_index,
+ int *hoplimit);
static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
{
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index 269a27cf..e30f19b 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -60,6 +60,7 @@ int ib_get_cached_gid(struct ib_device *device,
* a specified GID value occurs.
* @device: The device to query.
* @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
* @ndev: In RoCE, the net device of the device. NULL means ignore.
* @port_num: The port number of the device where the GID value was found.
* @index: The index into the cached GID table where the GID was found. This
@@ -70,6 +71,7 @@ int ib_get_cached_gid(struct ib_device *device,
*/
int ib_find_cached_gid(struct ib_device *device,
const union ib_gid *gid,
+ enum ib_gid_type gid_type,
struct net_device *ndev,
u8 *port_num,
u16 *index);
@@ -79,6 +81,7 @@ int ib_find_cached_gid(struct ib_device *device,
* GID value occurs
* @device: The device to query.
* @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
* @port_num: The port number of the device where the GID value sould be
* searched.
* @ndev: In RoCE, the net device of the device. Null means ignore.
@@ -90,6 +93,7 @@ int ib_find_cached_gid(struct ib_device *device,
*/
int ib_find_cached_gid_by_port(struct ib_device *device,
const union ib_gid *gid,
+ enum ib_gid_type gid_type,
u8 port_num,
struct net_device *ndev,
u16 *index);
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index ec9b44d..0ff049b 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -438,6 +438,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
/**
* ib_mad_recv_handler - callback handler for a received MAD.
* @mad_agent: MAD agent requesting the received MAD.
+ * @send_buf: Send buffer if found, else NULL
* @mad_recv_wc: Received work completion information on the received MAD.
*
* MADs received in response to a send request operation will be handed to
@@ -447,6 +448,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
* modify the data referenced by @mad_recv_wc.
*/
typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc);
/**
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index e99d8f9..0f3daae 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -41,6 +41,8 @@ enum {
IB_ETH_BYTES = 14,
IB_VLAN_BYTES = 4,
IB_GRH_BYTES = 40,
+ IB_IP4_BYTES = 20,
+ IB_UDP_BYTES = 8,
IB_BTH_BYTES = 12,
IB_DETH_BYTES = 8
};
@@ -223,6 +225,27 @@ struct ib_unpacked_eth {
__be16 type;
};
+struct ib_unpacked_ip4 {
+ u8 ver;
+ u8 hdr_len;
+ u8 tos;
+ __be16 tot_len;
+ __be16 id;
+ __be16 frag_off;
+ u8 ttl;
+ u8 protocol;
+ __sum16 check;
+ __be32 saddr;
+ __be32 daddr;
+};
+
+struct ib_unpacked_udp {
+ __be16 sport;
+ __be16 dport;
+ __be16 length;
+ __be16 csum;
+};
+
struct ib_unpacked_vlan {
__be16 tag;
__be16 type;
@@ -237,6 +260,10 @@ struct ib_ud_header {
struct ib_unpacked_vlan vlan;
int grh_present;
struct ib_unpacked_grh grh;
+ int ipv4_present;
+ struct ib_unpacked_ip4 ip4;
+ int udp_present;
+ struct ib_unpacked_udp udp;
struct ib_unpacked_bth bth;
struct ib_unpacked_deth deth;
int immediate_present;
@@ -253,13 +280,17 @@ void ib_unpack(const struct ib_field *desc,
void *buf,
void *structure);
-void ib_ud_header_init(int payload_bytes,
- int lrh_present,
- int eth_present,
- int vlan_present,
- int grh_present,
- int immediate_present,
- struct ib_ud_header *header);
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header);
+
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header);
int ib_ud_header_pack(struct ib_ud_header *header,
void *buf);
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h
index a5889f1..2f8a65c 100644
--- a/include/rdma/ib_pma.h
+++ b/include/rdma/ib_pma.h
@@ -42,6 +42,7 @@
*/
#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8)
#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10)
#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12)
#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 3019695..cdc1c81 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -160,6 +160,7 @@ struct ib_sa_path_rec {
int ifindex;
/* ignored in IB */
struct net *net;
+ enum ib_gid_type gid_type;
};
static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec)
@@ -402,6 +403,8 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
*/
int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
struct ib_ah_attr *ah_attr);
/**
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 120da1d..284b00c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -49,13 +49,19 @@
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
#include <linux/socket.h>
+#include <linux/irq_poll.h>
#include <uapi/linux/if_ether.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <linux/string.h>
+#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
union ib_gid {
u8 raw[16];
@@ -67,7 +73,17 @@ union ib_gid {
extern union ib_gid zgid;
+enum ib_gid_type {
+ /* If link layer is Ethernet, this is RoCE V1 */
+ IB_GID_TYPE_IB = 0,
+ IB_GID_TYPE_ROCE = 0,
+ IB_GID_TYPE_ROCE_UDP_ENCAP = 1,
+ IB_GID_TYPE_SIZE
+};
+
+#define ROCE_V2_UDP_DPORT 4791
struct ib_gid_attr {
+ enum ib_gid_type gid_type;
struct net_device *ndev;
};
@@ -98,6 +114,35 @@ enum rdma_protocol_type {
__attribute_const__ enum rdma_transport_type
rdma_node_get_transport(enum rdma_node_type node_type);
+enum rdma_network_type {
+ RDMA_NETWORK_IB,
+ RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB,
+ RDMA_NETWORK_IPV4,
+ RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+ if (network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6)
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+ /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */
+ return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type,
+ union ib_gid *gid)
+{
+ if (gid_type == IB_GID_TYPE_IB)
+ return RDMA_NETWORK_IB;
+
+ if (ipv6_addr_v4mapped((struct in6_addr *)gid))
+ return RDMA_NETWORK_IPV4;
+ else
+ return RDMA_NETWORK_IPV6;
+}
+
enum rdma_link_layer {
IB_LINK_LAYER_UNSPECIFIED,
IB_LINK_LAYER_INFINIBAND,
@@ -105,24 +150,32 @@ enum rdma_link_layer {
};
enum ib_device_cap_flags {
- IB_DEVICE_RESIZE_MAX_WR = 1,
- IB_DEVICE_BAD_PKEY_CNTR = (1<<1),
- IB_DEVICE_BAD_QKEY_CNTR = (1<<2),
- IB_DEVICE_RAW_MULTI = (1<<3),
- IB_DEVICE_AUTO_PATH_MIG = (1<<4),
- IB_DEVICE_CHANGE_PHY_PORT = (1<<5),
- IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6),
- IB_DEVICE_CURR_QP_STATE_MOD = (1<<7),
- IB_DEVICE_SHUTDOWN_PORT = (1<<8),
- IB_DEVICE_INIT_TYPE = (1<<9),
- IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10),
- IB_DEVICE_SYS_IMAGE_GUID = (1<<11),
- IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
- IB_DEVICE_SRQ_RESIZE = (1<<13),
- IB_DEVICE_N_NOTIFY_CQ = (1<<14),
- IB_DEVICE_LOCAL_DMA_LKEY = (1<<15),
- IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */
- IB_DEVICE_MEM_WINDOW = (1<<17),
+ IB_DEVICE_RESIZE_MAX_WR = (1 << 0),
+ IB_DEVICE_BAD_PKEY_CNTR = (1 << 1),
+ IB_DEVICE_BAD_QKEY_CNTR = (1 << 2),
+ IB_DEVICE_RAW_MULTI = (1 << 3),
+ IB_DEVICE_AUTO_PATH_MIG = (1 << 4),
+ IB_DEVICE_CHANGE_PHY_PORT = (1 << 5),
+ IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6),
+ IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7),
+ IB_DEVICE_SHUTDOWN_PORT = (1 << 8),
+ IB_DEVICE_INIT_TYPE = (1 << 9),
+ IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10),
+ IB_DEVICE_SYS_IMAGE_GUID = (1 << 11),
+ IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12),
+ IB_DEVICE_SRQ_RESIZE = (1 << 13),
+ IB_DEVICE_N_NOTIFY_CQ = (1 << 14),
+
+ /*
+ * This device supports a per-device lkey or stag that can be
+ * used without performing a memory registration for the local
+ * memory. Note that ULPs should never check this flag, but
+ * instead of use the local_dma_lkey flag in the ib_pd structure,
+ * which will always contain a usable lkey.
+ */
+ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15),
+ IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16),
+ IB_DEVICE_MEM_WINDOW = (1 << 17),
/*
* Devices should set IB_DEVICE_UD_IP_SUM if they support
* insertion of UDP and TCP checksum on outgoing UD IPoIB
@@ -130,18 +183,35 @@ enum ib_device_cap_flags {
* incoming messages. Setting this flag implies that the
* IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
*/
- IB_DEVICE_UD_IP_CSUM = (1<<18),
- IB_DEVICE_UD_TSO = (1<<19),
- IB_DEVICE_XRC = (1<<20),
- IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21),
- IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
- IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23),
- IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24),
- IB_DEVICE_RC_IP_CSUM = (1<<25),
- IB_DEVICE_RAW_IP_CSUM = (1<<26),
- IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
- IB_DEVICE_SIGNATURE_HANDOVER = (1<<30),
- IB_DEVICE_ON_DEMAND_PAGING = (1<<31),
+ IB_DEVICE_UD_IP_CSUM = (1 << 18),
+ IB_DEVICE_UD_TSO = (1 << 19),
+ IB_DEVICE_XRC = (1 << 20),
+
+ /*
+ * This device supports the IB "base memory management extension",
+ * which includes support for fast registrations (IB_WR_REG_MR,
+ * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should
+ * also be set by any iWarp device which must support FRs to comply
+ * to the iWarp verbs spec. iWarp devices also support the
+ * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
+ * stag.
+ */
+ IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21),
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22),
+ IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23),
+ IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24),
+ IB_DEVICE_RC_IP_CSUM = (1 << 25),
+ IB_DEVICE_RAW_IP_CSUM = (1 << 26),
+ /*
+ * Devices should set IB_DEVICE_CROSS_CHANNEL if they
+ * support execution of WQEs that involve synchronization
+ * of I/O operations with single completion queue managed
+ * by hardware.
+ */
+ IB_DEVICE_CROSS_CHANNEL = (1 << 27),
+ IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29),
+ IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30),
+ IB_DEVICE_ON_DEMAND_PAGING = (1 << 31),
};
enum ib_signature_prot_cap {
@@ -184,6 +254,7 @@ struct ib_odp_caps {
enum ib_cq_creation_flags {
IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0,
+ IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1,
};
struct ib_cq_init_attr {
@@ -393,6 +464,7 @@ union rdma_protocol_stats {
#define RDMA_CORE_CAP_PROT_IB 0x00100000
#define RDMA_CORE_CAP_PROT_ROCE 0x00200000
#define RDMA_CORE_CAP_PROT_IWARP 0x00400000
+#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \
| RDMA_CORE_CAP_IB_MAD \
@@ -405,6 +477,12 @@ union rdma_protocol_stats {
| RDMA_CORE_CAP_IB_CM \
| RDMA_CORE_CAP_AF_IB \
| RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \
+ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_AF_IB \
+ | RDMA_CORE_CAP_ETH_AH)
#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \
| RDMA_CORE_CAP_IW_CM)
#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \
@@ -519,6 +597,17 @@ struct ib_grh {
union ib_gid dgid;
};
+union rdma_network_hdr {
+ struct ib_grh ibgrh;
+ struct {
+ /* The IB spec states that if it's IPv4, the header
+ * is located in the last 20 bytes of the header.
+ */
+ u8 reserved[20];
+ struct iphdr roce4grh;
+ };
+};
+
enum {
IB_MULTICAST_QPN = 0xffffff
};
@@ -734,7 +823,6 @@ enum ib_wc_opcode {
IB_WC_RDMA_READ,
IB_WC_COMP_SWAP,
IB_WC_FETCH_ADD,
- IB_WC_BIND_MW,
IB_WC_LSO,
IB_WC_LOCAL_INV,
IB_WC_REG_MR,
@@ -755,10 +843,14 @@ enum ib_wc_flags {
IB_WC_IP_CSUM_OK = (1<<3),
IB_WC_WITH_SMAC = (1<<4),
IB_WC_WITH_VLAN = (1<<5),
+ IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6),
};
struct ib_wc {
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
enum ib_wc_status status;
enum ib_wc_opcode opcode;
u32 vendor_err;
@@ -777,6 +869,7 @@ struct ib_wc {
u8 port_num; /* valid only for DR SMPs on switches */
u8 smac[ETH_ALEN];
u16 vlan_id;
+ u8 network_hdr_type;
};
enum ib_cq_notify_flags {
@@ -866,6 +959,9 @@ enum ib_qp_type {
enum ib_qp_create_flags {
IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
+ IB_QP_CREATE_CROSS_CHANNEL = 1 << 2,
+ IB_QP_CREATE_MANAGED_SEND = 1 << 3,
+ IB_QP_CREATE_MANAGED_RECV = 1 << 4,
IB_QP_CREATE_NETIF_QP = 1 << 5,
IB_QP_CREATE_SIGNATURE_EN = 1 << 6,
IB_QP_CREATE_USE_GFP_NOIO = 1 << 7,
@@ -1027,7 +1123,6 @@ enum ib_wr_opcode {
IB_WR_REG_MR,
IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
- IB_WR_BIND_MW,
IB_WR_REG_SIG_MR,
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
@@ -1062,26 +1157,16 @@ struct ib_sge {
u32 lkey;
};
-/**
- * struct ib_mw_bind_info - Parameters for a memory window bind operation.
- * @mr: A memory region to bind the memory window to.
- * @addr: The address where the memory window should begin.
- * @length: The length of the memory window, in bytes.
- * @mw_access_flags: Access flags from enum ib_access_flags for the window.
- *
- * This struct contains the shared parameters for type 1 and type 2
- * memory window bind operations.
- */
-struct ib_mw_bind_info {
- struct ib_mr *mr;
- u64 addr;
- u64 length;
- int mw_access_flags;
+struct ib_cqe {
+ void (*done)(struct ib_cq *cq, struct ib_wc *wc);
};
struct ib_send_wr {
struct ib_send_wr *next;
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
struct ib_sge *sg_list;
int num_sge;
enum ib_wr_opcode opcode;
@@ -1147,19 +1232,6 @@ static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr)
return container_of(wr, struct ib_reg_wr, wr);
}
-struct ib_bind_mw_wr {
- struct ib_send_wr wr;
- struct ib_mw *mw;
- /* The new rkey for the memory window. */
- u32 rkey;
- struct ib_mw_bind_info bind_info;
-};
-
-static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr)
-{
- return container_of(wr, struct ib_bind_mw_wr, wr);
-}
-
struct ib_sig_handover_wr {
struct ib_send_wr wr;
struct ib_sig_attrs *sig_attrs;
@@ -1175,7 +1247,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
struct ib_recv_wr {
struct ib_recv_wr *next;
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
struct ib_sge *sg_list;
int num_sge;
};
@@ -1190,20 +1265,10 @@ enum ib_access_flags {
IB_ACCESS_ON_DEMAND = (1<<6),
};
-struct ib_phys_buf {
- u64 addr;
- u64 size;
-};
-
-struct ib_mr_attr {
- struct ib_pd *pd;
- u64 device_virt_addr;
- u64 size;
- int mr_access_flags;
- u32 lkey;
- u32 rkey;
-};
-
+/*
+ * XXX: these are apparently used for ->rereg_user_mr, no idea why they
+ * are hidden here instead of a uapi header!
+ */
enum ib_mr_rereg_flags {
IB_MR_REREG_TRANS = 1,
IB_MR_REREG_PD = (1<<1),
@@ -1211,18 +1276,6 @@ enum ib_mr_rereg_flags {
IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1)
};
-/**
- * struct ib_mw_bind - Parameters for a type 1 memory window bind operation.
- * @wr_id: Work request id.
- * @send_flags: Flags from ib_send_flags enum.
- * @bind_info: More parameters of the bind operation.
- */
-struct ib_mw_bind {
- u64 wr_id;
- int send_flags;
- struct ib_mw_bind_info bind_info;
-};
-
struct ib_fmr_attr {
int max_pages;
int max_maps;
@@ -1307,6 +1360,12 @@ struct ib_ah {
typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+enum ib_poll_context {
+ IB_POLL_DIRECT, /* caller context, no hw completions */
+ IB_POLL_SOFTIRQ, /* poll from softirq context */
+ IB_POLL_WORKQUEUE, /* poll from workqueue */
+};
+
struct ib_cq {
struct ib_device *device;
struct ib_uobject *uobject;
@@ -1315,6 +1374,12 @@ struct ib_cq {
void *cq_context;
int cqe;
atomic_t usecnt; /* count number of work queues */
+ enum ib_poll_context poll_ctx;
+ struct ib_wc *wc;
+ union {
+ struct irq_poll iop;
+ struct work_struct work;
+ };
};
struct ib_srq {
@@ -1363,7 +1428,6 @@ struct ib_mr {
u64 iova;
u32 length;
unsigned int page_size;
- atomic_t usecnt; /* count number of MWs */
};
struct ib_mw {
@@ -1724,11 +1788,6 @@ struct ib_device {
int wc_cnt);
struct ib_mr * (*get_dma_mr)(struct ib_pd *pd,
int mr_access_flags);
- struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
struct ib_mr * (*reg_user_mr)(struct ib_pd *pd,
u64 start, u64 length,
u64 virt_addr,
@@ -1741,8 +1800,6 @@ struct ib_device {
int mr_access_flags,
struct ib_pd *pd,
struct ib_udata *udata);
- int (*query_mr)(struct ib_mr *mr,
- struct ib_mr_attr *mr_attr);
int (*dereg_mr)(struct ib_mr *mr);
struct ib_mr * (*alloc_mr)(struct ib_pd *pd,
enum ib_mr_type mr_type,
@@ -1750,18 +1807,8 @@ struct ib_device {
int (*map_mr_sg)(struct ib_mr *mr,
struct scatterlist *sg,
int sg_nents);
- int (*rereg_phys_mr)(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
struct ib_mw * (*alloc_mw)(struct ib_pd *pd,
enum ib_mw_type type);
- int (*bind_mw)(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
int (*dealloc_mw)(struct ib_mw *mw);
struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd,
int mr_access_flags,
@@ -1823,6 +1870,7 @@ struct ib_device {
u16 is_switch:1;
u8 node_type;
u8 phys_port_cnt;
+ struct ib_device_attr attrs;
/**
* The following mandatory functions are used only at device
@@ -1888,6 +1936,31 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len
return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
}
+static inline bool ib_is_udata_cleared(struct ib_udata *udata,
+ size_t offset,
+ size_t len)
+{
+ const void __user *p = udata->inbuf + offset;
+ bool ret = false;
+ u8 *buf;
+
+ if (len > USHRT_MAX)
+ return false;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return false;
+
+ if (copy_from_user(buf, p, len))
+ goto free;
+
+ ret = !memchr_inv(buf, 0, len);
+
+free:
+ kfree(buf);
+ return ret;
+}
+
/**
* ib_modify_qp_is_ok - Check that the supplied attribute mask
* contains all required attributes and no attributes not allowed for
@@ -1912,9 +1985,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler);
int ib_unregister_event_handler(struct ib_event_handler *event_handler);
void ib_dispatch_event(struct ib_event *event);
-int ib_query_device(struct ib_device *device,
- struct ib_device_attr *device_attr);
-
int ib_query_port(struct ib_device *device,
u8 port_num, struct ib_port_attr *port_attr);
@@ -1968,6 +2038,17 @@ static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
{
+ return device->port_immutable[port_num].core_cap_flags &
+ (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+}
+
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+}
+
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
+{
return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE;
}
@@ -1978,8 +2059,8 @@ static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_n
static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
{
- return device->port_immutable[port_num].core_cap_flags &
- (RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_PROT_ROCE);
+ return rdma_protocol_ib(device, port_num) ||
+ rdma_protocol_roce(device, port_num);
}
/**
@@ -2220,7 +2301,8 @@ int ib_modify_port(struct ib_device *device,
struct ib_port_modify *port_modify);
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
- struct net_device *ndev, u8 *port_num, u16 *index);
+ enum ib_gid_type gid_type, struct net_device *ndev,
+ u8 *port_num, u16 *index);
int ib_find_pkey(struct ib_device *device,
u8 port_num, u16 pkey, u16 *index);
@@ -2454,6 +2536,11 @@ static inline int ib_post_recv(struct ib_qp *qp,
return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
}
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
+int ib_process_cq_direct(struct ib_cq *cq, int budget);
+
/**
* ib_create_cq - Creates a CQ on the specified device.
* @device: The device on which to create the CQ.
@@ -2839,13 +2926,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
}
/**
- * ib_query_mr - Retrieves information about a specific memory region.
- * @mr: The memory region to retrieve information about.
- * @mr_attr: The attributes of the specified memory region.
- */
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
-/**
* ib_dereg_mr - Deregisters a memory region and removes it from the
* HCA translation table.
* @mr: The memory region to deregister.
@@ -2882,42 +2962,6 @@ static inline u32 ib_inc_rkey(u32 rkey)
}
/**
- * ib_alloc_mw - Allocates a memory window.
- * @pd: The protection domain associated with the memory window.
- * @type: The type of the memory window (1 or 2).
- */
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-/**
- * ib_bind_mw - Posts a work request to the send queue of the specified
- * QP, which binds the memory window to the given address range and
- * remote access attributes.
- * @qp: QP to post the bind work request on.
- * @mw: The memory window to bind.
- * @mw_bind: Specifies information about the memory window, including
- * its address range, remote access rights, and associated memory region.
- *
- * If there is no immediate error, the function will update the rkey member
- * of the mw parameter to its new value. The bind operation can still fail
- * asynchronously.
- */
-static inline int ib_bind_mw(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind)
-{
- /* XXX reference counting in corresponding MR? */
- return mw->device->bind_mw ?
- mw->device->bind_mw(qp, mw, mw_bind) :
- -ENOSYS;
-}
-
-/**
- * ib_dealloc_mw - Deallocates a memory window.
- * @mw: The memory window to deallocate.
- */
-int ib_dealloc_mw(struct ib_mw *mw);
-
-/**
* ib_alloc_fmr - Allocates a unmapped fast memory region.
* @pd: The protection domain associated with the unmapped region.
* @mr_access_flags: Specifies the memory access rights.
diff --git a/include/scsi/iser.h b/include/scsi/iser.h
new file mode 100644
index 0000000..2e678fa
--- /dev/null
+++ b/include/scsi/iser.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ISCSI_ISER_H
+#define ISCSI_ISER_H
+
+#define ISER_ZBVA_NOT_SUP 0x80
+#define ISER_SEND_W_INV_NOT_SUP 0x40
+#define ISERT_ZBVA_NOT_USED 0x80
+#define ISERT_SEND_W_INV_NOT_USED 0x40
+
+#define ISCSI_CTRL 0x10
+#define ISER_HELLO 0x20
+#define ISER_HELLORPLY 0x30
+
+#define ISER_VER 0x10
+#define ISER_WSV 0x08
+#define ISER_RSV 0x04
+
+/**
+ * struct iser_cm_hdr - iSER CM header (from iSER Annex A12)
+ *
+ * @flags: flags support (zbva, send_w_inv)
+ * @rsvd: reserved
+ */
+struct iser_cm_hdr {
+ u8 flags;
+ u8 rsvd[3];
+} __packed;
+
+/**
+ * struct iser_ctrl - iSER header of iSCSI control PDU
+ *
+ * @flags: opcode and read/write valid bits
+ * @rsvd: reserved
+ * @write_stag: write rkey
+ * @write_va: write virtual address
+ * @reaf_stag: read rkey
+ * @read_va: read virtual address
+ */
+struct iser_ctrl {
+ u8 flags;
+ u8 rsvd[3];
+ __be32 write_stag;
+ __be64 write_va;
+ __be32 read_stag;
+ __be64 read_va;
+} __packed;
+
+#endif /* ISCSI_ISER_H */
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index ff8f6c0..f95f25e 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -15,7 +15,7 @@ struct softirq_action;
softirq_name(NET_TX) \
softirq_name(NET_RX) \
softirq_name(BLOCK) \
- softirq_name(BLOCK_IOPOLL) \
+ softirq_name(IRQ_POLL) \
softirq_name(TASKLET) \
softirq_name(SCHED) \
softirq_name(HRTIMER) \
diff --git a/lib/Kconfig b/lib/Kconfig
index 435f731..133ebc0 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -477,6 +477,11 @@ config DDR
information. This data is useful for drivers handling
DDR SDRAM controllers.
+config IRQ_POLL
+ bool "IRQ polling library"
+ help
+ Helper library to poll interrupt mitigation using polling.
+
config MPILIB
tristate
select CLZ_TAB
diff --git a/lib/Makefile b/lib/Makefile
index 2d4bc33..a7c26a4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -165,6 +165,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
obj-$(CONFIG_SG_SPLIT) += sg_split.o
obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
+obj-$(CONFIG_IRQ_POLL) += irq_poll.o
libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
fdt_empty_tree.o
diff --git a/block/blk-iopoll.c b/lib/irq_poll.c
index 0736729..836f7db 100644
--- a/block/blk-iopoll.c
+++ b/lib/irq_poll.c
@@ -6,84 +6,84 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
-#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
#include <linux/delay.h>
-#include "blk.h"
-
-static unsigned int blk_iopoll_budget __read_mostly = 256;
+static unsigned int irq_poll_budget __read_mostly = 256;
static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
/**
- * blk_iopoll_sched - Schedule a run of the iopoll handler
+ * irq_poll_sched - Schedule a run of the iopoll handler
* @iop: The parent iopoll structure
*
* Description:
- * Add this blk_iopoll structure to the pending poll list and trigger the
- * raise of the blk iopoll softirq. The driver must already have gotten a
- * successful return from blk_iopoll_sched_prep() before calling this.
+ * Add this irq_poll structure to the pending poll list and trigger the
+ * raise of the blk iopoll softirq.
**/
-void blk_iopoll_sched(struct blk_iopoll *iop)
+void irq_poll_sched(struct irq_poll *iop)
{
unsigned long flags;
+ if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
+ return;
+ if (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state))
+ return;
+
local_irq_save(flags);
list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(blk_iopoll_sched);
+EXPORT_SYMBOL(irq_poll_sched);
/**
- * __blk_iopoll_complete - Mark this @iop as un-polled again
+ * __irq_poll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
- * See blk_iopoll_complete(). This function must be called with interrupts
+ * See irq_poll_complete(). This function must be called with interrupts
* disabled.
**/
-void __blk_iopoll_complete(struct blk_iopoll *iop)
+static void __irq_poll_complete(struct irq_poll *iop)
{
list_del(&iop->list);
smp_mb__before_atomic();
- clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+ clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
}
-EXPORT_SYMBOL(__blk_iopoll_complete);
/**
- * blk_iopoll_complete - Mark this @iop as un-polled again
+ * irq_poll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
* If a driver consumes less than the assigned budget in its run of the
* iopoll handler, it'll end the polled mode by calling this function. The
- * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
+ * iopoll handler will not be invoked again before irq_poll_sched()
* is called.
**/
-void blk_iopoll_complete(struct blk_iopoll *iop)
+void irq_poll_complete(struct irq_poll *iop)
{
unsigned long flags;
local_irq_save(flags);
- __blk_iopoll_complete(iop);
+ __irq_poll_complete(iop);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(blk_iopoll_complete);
+EXPORT_SYMBOL(irq_poll_complete);
-static void blk_iopoll_softirq(struct softirq_action *h)
+static void irq_poll_softirq(struct softirq_action *h)
{
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
- int rearm = 0, budget = blk_iopoll_budget;
+ int rearm = 0, budget = irq_poll_budget;
unsigned long start_time = jiffies;
local_irq_disable();
while (!list_empty(list)) {
- struct blk_iopoll *iop;
+ struct irq_poll *iop;
int work, weight;
/*
@@ -101,11 +101,11 @@ static void blk_iopoll_softirq(struct softirq_action *h)
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
- iop = list_entry(list->next, struct blk_iopoll, list);
+ iop = list_entry(list->next, struct irq_poll, list);
weight = iop->weight;
work = 0;
- if (test_bit(IOPOLL_F_SCHED, &iop->state))
+ if (test_bit(IRQ_POLL_F_SCHED, &iop->state))
work = iop->poll(iop, weight);
budget -= work;
@@ -121,72 +121,70 @@ static void blk_iopoll_softirq(struct softirq_action *h)
* move the instance around on the list at-will.
*/
if (work >= weight) {
- if (blk_iopoll_disable_pending(iop))
- __blk_iopoll_complete(iop);
+ if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
+ __irq_poll_complete(iop);
else
list_move_tail(&iop->list, list);
}
}
if (rearm)
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_enable();
}
/**
- * blk_iopoll_disable - Disable iopoll on this @iop
+ * irq_poll_disable - Disable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Disable io polling and wait for any pending callbacks to have completed.
**/
-void blk_iopoll_disable(struct blk_iopoll *iop)
+void irq_poll_disable(struct irq_poll *iop)
{
- set_bit(IOPOLL_F_DISABLE, &iop->state);
- while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
+ set_bit(IRQ_POLL_F_DISABLE, &iop->state);
+ while (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state))
msleep(1);
- clear_bit(IOPOLL_F_DISABLE, &iop->state);
+ clear_bit(IRQ_POLL_F_DISABLE, &iop->state);
}
-EXPORT_SYMBOL(blk_iopoll_disable);
+EXPORT_SYMBOL(irq_poll_disable);
/**
- * blk_iopoll_enable - Enable iopoll on this @iop
+ * irq_poll_enable - Enable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Enable iopoll on this @iop. Note that the handler run will not be
* scheduled, it will only mark it as active.
**/
-void blk_iopoll_enable(struct blk_iopoll *iop)
+void irq_poll_enable(struct irq_poll *iop)
{
- BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
+ BUG_ON(!test_bit(IRQ_POLL_F_SCHED, &iop->state));
smp_mb__before_atomic();
- clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+ clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
}
-EXPORT_SYMBOL(blk_iopoll_enable);
+EXPORT_SYMBOL(irq_poll_enable);
/**
- * blk_iopoll_init - Initialize this @iop
+ * irq_poll_init - Initialize this @iop
* @iop: The parent iopoll structure
* @weight: The default weight (or command completion budget)
* @poll_fn: The handler to invoke
*
* Description:
- * Initialize this blk_iopoll structure. Before being actively used, the
- * driver must call blk_iopoll_enable().
+ * Initialize and enable this irq_poll structure.
**/
-void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
+void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn)
{
memset(iop, 0, sizeof(*iop));
INIT_LIST_HEAD(&iop->list);
iop->weight = weight;
iop->poll = poll_fn;
- set_bit(IOPOLL_F_SCHED, &iop->state);
}
-EXPORT_SYMBOL(blk_iopoll_init);
+EXPORT_SYMBOL(irq_poll_init);
-static int blk_iopoll_cpu_notify(struct notifier_block *self,
+static int irq_poll_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
/*
@@ -199,26 +197,26 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_enable();
}
return NOTIFY_OK;
}
-static struct notifier_block blk_iopoll_cpu_notifier = {
- .notifier_call = blk_iopoll_cpu_notify,
+static struct notifier_block irq_poll_cpu_notifier = {
+ .notifier_call = irq_poll_cpu_notify,
};
-static __init int blk_iopoll_setup(void)
+static __init int irq_poll_setup(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
- open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
- register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
+ open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
+ register_hotcpu_notifier(&irq_poll_cpu_notifier);
return 0;
}
-subsys_initcall(blk_iopoll_setup);
+subsys_initcall(irq_poll_setup);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index f222885..9481d55 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
static void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- struct ib_device_attr *dev_attr;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
return;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
- goto free_attr;
+ return;
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
- rds_ibdev->max_wrs = dev_attr->max_qp_wr;
- rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->max_wrs = device->attrs.max_qp_wr;
+ rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
- rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
- rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, (dev_attr->max_mr / 2),
+ rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
+ rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+ min_t(unsigned int, (device->attrs.max_mr / 2),
rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
- rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
+ rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
- rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
- rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+ rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
@@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device)
}
rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
- dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
rds_ibdev->max_8k_fmrs);
@@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device)
put_dev:
rds_ib_dev_put(rds_ibdev);
-free_attr:
- kfree(dev_attr);
}
/*
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 576f182..f4a9fff 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns);
static void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
- struct ib_device_attr *dev_attr;
/* Only handle iwarp devices */
if (device->node_type != RDMA_NODE_RNIC)
return;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
if (!rds_iwdev)
- goto free_attr;
+ return;
spin_lock_init(&rds_iwdev->spinlock);
- rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
- rds_iwdev->max_wrs = dev_attr->max_qp_wr;
- rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+ rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+ rds_iwdev->max_wrs = device->attrs.max_qp_wr;
+ rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
@@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device)
list_add_tail(&rds_iwdev->list, &rds_iw_devices);
ib_set_client_data(device, &rds_iw_client, rds_iwdev);
-
- goto free_attr;
+ return;
err_mr:
if (rds_iwdev->mr)
@@ -121,8 +110,6 @@ err_pd:
ib_dealloc_pd(rds_iwdev->pd);
free_dev:
kfree(rds_iwdev);
-free_attr:
- kfree(dev_attr);
}
static void rds_iw_remove_one(struct ib_device *device, void *client_data)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e98f4a..37edea6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt)
if (atomic_dec_and_test(&xprt->count))
xprt_destroy(xprt);
}
+EXPORT_SYMBOL_GPL(xprt_put);
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 33f99d3..dc9f3b5 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o physical_ops.o \
- svc_rdma.o svc_rdma_transport.o \
+ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c6836844..e165673 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -190,12 +190,11 @@ static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
int depth, delta;
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- devattr->max_fast_reg_page_list_len);
+ ia->ri_device->attrs.max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
- cdata->max_requests = devattr->max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
+ cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 1b7051b..c846ca9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD;
static unsigned int min_ord = 1;
static unsigned int max_ord = 4096;
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
@@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
-/* Temporary NFS request map and context caches */
-struct kmem_cache *svc_rdma_map_cachep;
-struct kmem_cache *svc_rdma_ctxt_cachep;
-
struct workqueue_struct *svc_rdma_wq;
/*
@@ -243,17 +240,16 @@ void svc_rdma_cleanup(void)
svc_unreg_xprt_class(&svc_rdma_bc_class);
#endif
svc_unreg_xprt_class(&svc_rdma_class);
- kmem_cache_destroy(svc_rdma_map_cachep);
- kmem_cache_destroy(svc_rdma_ctxt_cachep);
}
int svc_rdma_init(void)
{
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
- dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %d\n",
+ dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
+ dprintk("\tsq_depth : %u\n",
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+ dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
@@ -264,39 +260,10 @@ int svc_rdma_init(void)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
- /* Create the temporary map cache */
- svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
- sizeof(struct svc_rdma_req_map),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_map_cachep) {
- printk(KERN_INFO "Could not allocate map cache.\n");
- goto err0;
- }
-
- /* Create the temporary context cache */
- svc_rdma_ctxt_cachep =
- kmem_cache_create("svc_rdma_ctxt_cache",
- sizeof(struct svc_rdma_op_ctxt),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_ctxt_cachep) {
- printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
- goto err1;
- }
-
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
svc_reg_xprt_class(&svc_rdma_bc_class);
#endif
return 0;
- err1:
- kmem_cache_destroy(svc_rdma_map_cachep);
- err0:
- unregister_sysctl_table(svcrdma_table_header);
- destroy_workqueue(svc_rdma_wq);
- return -ENOMEM;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644
index 0000000..65a7c23
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA (server-side).
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+#undef SVCRDMA_BACKCHANNEL_DEBUG
+
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+ struct xdr_buf *rcvbuf)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct kvec *dst, *src = &rcvbuf->head[0];
+ struct rpc_rqst *req;
+ unsigned long cwnd;
+ u32 credits;
+ size_t len;
+ __be32 xid;
+ __be32 *p;
+ int ret;
+
+ p = (__be32 *)src->iov_base;
+ len = src->iov_len;
+ xid = rmsgp->rm_xid;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: xid=%08x, length=%zu\n",
+ __func__, be32_to_cpu(xid), len);
+ pr_info("%s: RPC/RDMA: %*ph\n",
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ pr_info("%s: RPC: %*ph\n",
+ __func__, (int)len, p);
+#endif
+
+ ret = -EAGAIN;
+ if (src->iov_len < 24)
+ goto out_shortreply;
+
+ spin_lock_bh(&xprt->transport_lock);
+ req = xprt_lookup_rqst(xprt, xid);
+ if (!req)
+ goto out_notfound;
+
+ dst = &req->rq_private_buf.head[0];
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+ if (dst->iov_len < len)
+ goto out_unlock;
+ memcpy(dst->iov_base, p, len);
+
+ credits = be32_to_cpu(rmsgp->rm_credit);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+ credits = r_xprt->rx_buf.rb_bc_max_requests;
+
+ cwnd = xprt->cwnd;
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(req->rq_task);
+
+ ret = 0;
+ xprt_complete_rqst(req->rq_task, rcvbuf->len);
+ rcvbuf->len = 0;
+
+out_unlock:
+ spin_unlock_bh(&xprt->transport_lock);
+out:
+ return ret;
+
+out_shortreply:
+ dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
+ xprt, src->iov_len);
+ goto out;
+
+out_notfound:
+ dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
+ xprt, be32_to_cpu(xid));
+
+ goto out_unlock;
+}
+
+/* Send a backwards direction RPC call.
+ *
+ * Caller holds the connection's mutex and has already marshaled
+ * the RPC/RDMA request.
+ *
+ * This is similar to svc_rdma_reply, but takes an rpc_rqst
+ * instead, does not support chunks, and avoids blocking memory
+ * allocation.
+ *
+ * XXX: There is still an opportunity to block in svc_rdma_send()
+ * if there are no SQ entries to post the Send. This may occur if
+ * the adapter has a small maximum SQ depth.
+ */
+static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
+ struct ib_send_wr send_wr;
+ int ret;
+
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, sndbuf, vec);
+ if (ret)
+ goto out_err;
+
+ /* Post a recv buffer to handle the reply for this request. */
+ ret = svc_rdma_post_recv(rdma, GFP_NOIO);
+ if (ret) {
+ pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n",
+ ret);
+ pr_err("svcrdma: closing transport %p.\n", rdma);
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ ret = -ENOTCONN;
+ goto out_err;
+ }
+
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
+ ctxt->count = 1;
+
+ ctxt->wr_op = IB_WR_SEND;
+ ctxt->direction = DMA_TO_DEVICE;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
+ ctxt->sge[0].length = sndbuf->len;
+ ctxt->sge[0].addr =
+ ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
+ sndbuf->len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+ atomic_inc(&rdma->sc_dma_used);
+
+ memset(&send_wr, 0, sizeof(send_wr));
+ send_wr.wr_id = (unsigned long)ctxt;
+ send_wr.sg_list = ctxt->sge;
+ send_wr.num_sge = 1;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+out_err:
+ svc_rdma_put_req_map(rdma, vec);
+ dprintk("svcrdma: %s returns %d\n", __func__, ret);
+ return ret;
+
+out_unmap:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ goto out_err;
+}
+
+/* Server-side transport endpoint wants a whole page for its send
+ * buffer. The client RPC code constructs the RPC header in this
+ * buffer before it invokes ->send_request.
+ *
+ * Returns NULL if there was a temporary allocation failure.
+ */
+static void *
+xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ struct page *page;
+
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+
+ /* Prevent an infinite loop: try to make this case work */
+ if (size > PAGE_SIZE)
+ WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
+ size);
+
+ page = alloc_page(RPCRDMA_DEF_GFP);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+static void
+xprt_rdma_bc_free(void *buffer)
+{
+ /* No-op: ctxt and page have already been freed. */
+}
+
+static int
+rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ int rc;
+
+ /* Space in the send buffer for an RPC/RDMA header is reserved
+ * via xprt->tsh_size.
+ */
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ headerp->rm_type = rdma_msg;
+ headerp->rm_body.rm_chunks[0] = xdr_zero;
+ headerp->rm_body.rm_chunks[1] = xdr_zero;
+ headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
+#endif
+
+ rc = svc_rdma_bc_sendto(rdma, rqst);
+ if (rc)
+ goto drop_connection;
+ return rc;
+
+drop_connection:
+ dprintk("svcrdma: failed to send bc call\n");
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN;
+}
+
+/* Send an RPC call on the passive end of a transport
+ * connection.
+ */
+static int
+xprt_rdma_bc_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ int ret;
+
+ dprintk("svcrdma: sending bc call with xid: %08x\n",
+ be32_to_cpu(rqst->rq_xid));
+
+ if (!mutex_trylock(&sxprt->xpt_mutex)) {
+ rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
+ if (!mutex_trylock(&sxprt->xpt_mutex))
+ return -EAGAIN;
+ rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
+ }
+
+ ret = -ENOTCONN;
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+ if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+
+ mutex_unlock(&sxprt->xpt_mutex);
+
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static void
+xprt_rdma_bc_close(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+}
+
+static void
+xprt_rdma_bc_put(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
+static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .alloc_slot = xprt_alloc_slot,
+ .release_request = xprt_release_rqst_cong,
+ .buf_alloc = xprt_rdma_bc_allocate,
+ .buf_free = xprt_rdma_bc_free,
+ .send_request = xprt_rdma_bc_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xprt_rdma_bc_close,
+ .destroy = xprt_rdma_bc_put,
+ .print_stats = xprt_rdma_print_stats
+};
+
+static const struct rpc_timeout xprt_rdma_bc_timeout = {
+ .to_initval = 60 * HZ,
+ .to_maxval = 60 * HZ,
+};
+
+/* It shouldn't matter if the number of backchannel session slots
+ * doesn't match the number of RPC/RDMA credits. That just means
+ * one or the other will have extra slots that aren't used.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma_bc(struct xprt_create *args)
+{
+ struct rpc_xprt *xprt;
+ struct rpcrdma_xprt *new_xprt;
+
+ if (args->addrlen > sizeof(xprt->addr)) {
+ dprintk("RPC: %s: address too large\n", __func__);
+ return ERR_PTR(-EBADF);
+ }
+
+ xprt = xprt_alloc(args->net, sizeof(*new_xprt),
+ RPCRDMA_MAX_BC_REQUESTS,
+ RPCRDMA_MAX_BC_REQUESTS);
+ if (!xprt) {
+ dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
+ __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ xprt->timeout = &xprt_rdma_bc_timeout;
+ xprt_set_bound(xprt);
+ xprt_set_connected(xprt);
+ xprt->bind_timeout = RPCRDMA_BIND_TO;
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+ xprt->prot = XPRT_TRANSPORT_BC_RDMA;
+ xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32);
+ xprt->ops = &xprt_rdma_bc_procs;
+
+ memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+ xprt->addrlen = args->addrlen;
+ xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
+ xprt->resvport = 0;
+
+ xprt->max_payload = xprt_rdma_max_inline_read;
+
+ new_xprt = rpcx_to_rdmax(xprt);
+ new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
+
+ xprt_get(xprt);
+ args->bc_xprt->xpt_bc_xprt = xprt;
+ xprt->bc_xprt = args->bc_xprt;
+
+ if (!try_module_get(THIS_MODULE))
+ goto out_fail;
+
+ /* Final put for backchannel xprt is in __svc_rdma_free */
+ xprt_get(xprt);
+ return xprt;
+
+out_fail:
+ xprt_rdma_free_addresses(xprt);
+ args->bc_xprt->xpt_bc_xprt = NULL;
+ xprt_put(xprt);
+ xprt_free(xprt);
+ return ERR_PTR(-EINVAL);
+}
+
+struct xprt_class xprt_rdma_bc = {
+ .list = LIST_HEAD_INIT(xprt_rdma_bc.list),
+ .name = "rdma backchannel",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_BC_RDMA,
+ .setup = xprt_setup_rdma_bc,
+};
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ff4f01e..c8b8a8b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
head->arg.page_len += len;
+
head->arg.len += len;
if (!pg_off)
head->count++;
@@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
goto err;
atomic_inc(&xprt->sc_dma_used);
- /* The lkey here is either a local dma lkey or a dma_mr lkey */
- ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[pno].length = len;
ctxt->count++;
@@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
return ret;
}
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+{
+ __be32 *p = (__be32 *)rmsgp;
+
+ if (!xprt->xpt_bc_xprt)
+ return false;
+
+ if (rmsgp->rm_type != rdma_msg)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != rmsgp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] == cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+
/*
* Set up the rqstp thread context to point to the RQ buffer. If
* necessary, pull additional data from the client with an RDMA_READ
@@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto close_out;
}
+ if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ &rqstp->rq_arg);
+ svc_rdma_put_context(ctxt, 0);
+ if (ret)
+ goto repost;
+ return ret;
+ }
+
/* Read read-list data. */
ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
if (ret > 0) {
@@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
set_bit(XPT_CLOSE, &xprt->xpt_flags);
defer:
return 0;
+
+repost:
+ ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL);
+ if (ret) {
+ pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
+ ret);
+ pr_err("svcrdma: closing transport %p.\n", rdma_xprt);
+ set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags);
+ ret = -ENOTCONN;
+ }
+ return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 969a1ab..df57f3c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -50,9 +50,9 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-static int map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec)
+int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
{
int sge_no;
u32 sge_bytes;
@@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
if (xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+ pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
return -EIO;
}
@@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt,
sge_no++;
}
- dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+ dprintk("svcrdma: %s: sge_no %d page_no %d "
"page_base %u page_len %u head_len %zu tail_len %zu\n",
- sge_no, page_no, xdr->page_base, xdr->page_len,
+ __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
xdr->head[0].iov_len, xdr->tail[0].iov_len);
vec->count = sge_no;
@@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge[sge_no].addr))
goto err;
atomic_inc(&xprt->sc_dma_used);
- sge[sge_no].lkey = xprt->sc_dma_lkey;
+ sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count++;
sge_off = 0;
sge_no++;
@@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int ret;
/* Post a recv buffer to handle another request. */
- ret = svc_rdma_post_recv(rdma);
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
if (ret) {
printk(KERN_INFO
"svcrdma: could not post a receive buffer, err=%d."
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->count = 1;
/* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
@@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[sge_no].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
if (byte_count != 0) {
@@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
/* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
- vec = svc_rdma_get_req_map();
- ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec);
if (ret)
goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
- res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ ret = -ENOMEM;
+ res_page = alloc_page(GFP_KERNEL);
+ if (!res_page)
+ goto err0;
rdma_resp = page_address(res_page);
reply_ary = svc_rdma_get_reply_array(rdma_argp);
if (reply_ary)
@@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
inline_bytes);
- svc_rdma_put_req_map(vec);
+ svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(vec);
+ svc_rdma_put_req_map(rdma, vec);
svc_rdma_put_context(ctxt, 0);
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b348b4a..5763825 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
+ gfp_t flags)
{
struct svc_rdma_op_ctxt *ctxt;
- ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
- ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt = kmalloc(sizeof(*ctxt), flags);
+ if (ctxt) {
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->free);
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ }
+ return ctxt;
+}
+
+static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* Each RPC/RDMA credit can consume a number of send
+ * and receive WQEs. One ctxt is allocated for each.
+ */
+ i = xprt->sc_sq_depth + xprt->sc_rq_depth;
+
+ while (i--) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = alloc_ctxt(xprt, GFP_KERNEL);
+ if (!ctxt) {
+ dprintk("svcrdma: No memory for RDMA ctxt\n");
+ return false;
+ }
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ }
+ return true;
+}
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used++;
+ if (list_empty(&xprt->sc_ctxts))
+ goto out_empty;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del_init(&ctxt->free);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+out:
ctxt->count = 0;
ctxt->frmr = NULL;
- atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
+
+out_empty:
+ /* Either pre-allocation missed the mark, or send
+ * queue accounting is broken.
+ */
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+ ctxt = alloc_ctxt(xprt, GFP_NOIO);
+ if (ctxt)
+ goto out;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
+ return NULL;
}
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
@@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
- * the sc_dma_lkey, otherwise, ignore it since it is
+ * the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_page(xprt->sc_cm_id->device,
ctxt->sge[i].addr,
@@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
{
- struct svcxprt_rdma *xprt;
+ struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
- xprt = ctxt->xprt;
if (free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
- atomic_dec(&xprt->sc_ctxt_used);
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
}
-/*
- * Temporary NFS req mappings are shared across all transport
- * instances. These are short lived and should be bounded by the number
- * of concurrent server threads * depth of the SQ.
- */
-struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
+{
+ while (!list_empty(&xprt->sc_ctxts)) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del(&ctxt->free);
+ kfree(ctxt);
+ }
+}
+
+static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
struct svc_rdma_req_map *map;
- map = kmem_cache_alloc(svc_rdma_map_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
+
+ map = kmalloc(sizeof(*map), flags);
+ if (map)
+ INIT_LIST_HEAD(&map->free);
+ return map;
+}
+
+static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* One for each receive buffer on this connection. */
+ i = xprt->sc_max_requests;
+
+ while (i--) {
+ struct svc_rdma_req_map *map;
+
+ map = alloc_req_map(GFP_KERNEL);
+ if (!map) {
+ dprintk("svcrdma: No memory for request map\n");
+ return false;
+ }
+ list_add(&map->free, &xprt->sc_maps);
+ }
+ return true;
+}
+
+struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_req_map *map = NULL;
+
+ spin_lock(&xprt->sc_map_lock);
+ if (list_empty(&xprt->sc_maps))
+ goto out_empty;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del_init(&map->free);
+ spin_unlock(&xprt->sc_map_lock);
+
+out:
map->count = 0;
return map;
+
+out_empty:
+ spin_unlock(&xprt->sc_map_lock);
+
+ /* Pre-allocation amount was incorrect */
+ map = alloc_req_map(GFP_NOIO);
+ if (map)
+ goto out;
+
+ WARN_ONCE(1, "svcrdma: empty request map list?\n");
+ return NULL;
+}
+
+void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
+ struct svc_rdma_req_map *map)
+{
+ spin_lock(&xprt->sc_map_lock);
+ list_add(&map->free, &xprt->sc_maps);
+ spin_unlock(&xprt->sc_map_lock);
}
-void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
- kmem_cache_free(svc_rdma_map_cachep, map);
+ while (!list_empty(&xprt->sc_maps)) {
+ struct svc_rdma_req_map *map;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del(&map->free);
+ kfree(map);
+ }
}
/* ib_cq event handler */
@@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
static void process_context(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *ctxt)
{
+ struct svc_rdma_op_ctxt *read_hdr;
+ int free_pages = 0;
+
svc_rdma_unmap_dma(ctxt);
switch (ctxt->wr_op) {
case IB_WR_SEND:
- if (ctxt->frmr)
- pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 1);
+ free_pages = 1;
break;
case IB_WR_RDMA_WRITE:
- if (ctxt->frmr)
- pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 0);
break;
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
svc_rdma_put_frmr(xprt, ctxt->frmr);
- if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
- struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- if (read_hdr) {
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
- } else {
- pr_err("svcrdma: ctxt->read_hdr == NULL\n");
- }
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
+
+ if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
+ break;
+
+ read_hdr = ctxt->read_hdr;
svc_rdma_put_context(ctxt, 0);
- break;
+
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ return;
default:
- printk(KERN_ERR "svcrdma: unexpected completion type, "
- "opcode=%d\n",
- ctxt->wr_op);
+ dprintk("svcrdma: unexpected completion opcode=%d\n",
+ ctxt->wr_op);
break;
}
+
+ svc_rdma_put_context(ctxt, free_pages);
}
/*
@@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
+ INIT_LIST_HEAD(&cma_xprt->sc_maps);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
-
- cma_xprt->sc_ord = svcrdma_ord;
-
- cma_xprt->sc_max_req_size = svcrdma_max_req_size;
- cma_xprt->sc_max_requests = svcrdma_max_requests;
- cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
- atomic_set(&cma_xprt->sc_sq_count, 0);
- atomic_set(&cma_xprt->sc_ctxt_used, 0);
+ spin_lock_init(&cma_xprt->sc_ctxt_lock);
+ spin_lock_init(&cma_xprt->sc_map_lock);
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return cma_xprt;
}
-int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
@@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err_put_ctxt;
}
- page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ page = alloc_page(flags);
+ if (!page)
+ goto err_put_ctxt;
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
@@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
- ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count = sge_no + 1;
buflen += PAGE_SIZE;
}
@@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
- struct ib_device_attr devattr;
- int uninitialized_var(dma_mr_acc);
- int need_dma_mr = 0;
- int ret;
- int i;
+ struct ib_device *dev;
+ unsigned int i;
+ int ret = 0;
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
newxprt, newxprt->sc_cm_id);
- ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
- if (ret) {
- dprintk("svcrdma: could not query device attributes on "
- "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
- goto errout;
- }
+ dev = newxprt->sc_cm_id->device;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
- newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+ newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
(size_t)RPCSVC_MAXPAGES);
- newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+ newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
RPCSVC_MAXPAGES);
- newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
- (size_t)svcrdma_max_requests);
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+ newxprt->sc_max_req_size = svcrdma_max_req_size;
+ newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_requests);
+ newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_bc_requests);
+ newxprt->sc_rq_depth = newxprt->sc_max_requests +
+ newxprt->sc_max_bc_requests;
+ newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+
+ if (!svc_rdma_prealloc_ctxts(newxprt))
+ goto errout;
+ if (!svc_rdma_prealloc_maps(newxprt))
+ goto errout;
/*
* Limit ORD based on client limit, local device limit, and
* configured svcrdma limit.
*/
- newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
- newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+ newxprt->sc_pd = ib_alloc_pd(dev);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
cq_attr.cqe = newxprt->sc_sq_depth;
- newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ newxprt->sc_sq_cq = ib_create_cq(dev,
sq_comp_handler,
cq_event_handler,
newxprt,
@@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
- cq_attr.cqe = newxprt->sc_max_requests;
- newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ cq_attr.cqe = newxprt->sc_rq_depth;
+ newxprt->sc_rq_cq = ib_create_cq(dev,
rq_comp_handler,
cq_event_handler,
newxprt,
@@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
- qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+ qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
" cap.max_send_sge = %d\n"
" cap.max_recv_sge = %d\n",
newxprt->sc_cm_id, newxprt->sc_pd,
- newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+ dev, newxprt->sc_pd->device,
qp_attr.cap.max_send_wr,
qp_attr.cap.max_recv_wr,
qp_attr.cap.max_send_sge,
@@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* of an RDMA_READ. IB does not.
*/
newxprt->sc_reader = rdma_read_chunk_lcl;
- if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
newxprt->sc_frmr_pg_list_len =
- devattr.max_fast_reg_page_list_len;
+ dev->attrs.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
newxprt->sc_reader = rdma_read_chunk_frmr;
}
@@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
- if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !rdma_ib_or_roce(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
+ !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
goto errout;
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
- !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
- dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
- }
-
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
- /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
- if (need_dma_mr) {
- /* Register all of physical memory */
- newxprt->sc_phys_mr =
- ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
- if (IS_ERR(newxprt->sc_phys_mr)) {
- dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
- ret);
- goto errout;
- }
- newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
- } else
- newxprt->sc_dma_lkey =
- newxprt->sc_cm_id->device->local_dma_lkey;
-
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_max_requests; i++) {
- ret = svc_rdma_post_recv(newxprt);
+ for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
goto errout;
@@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work)
{
struct svcxprt_rdma *rdma =
container_of(work, struct svcxprt_rdma, sc_work);
- dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, rdma);
/* We should only be called from kref_put */
- if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+ if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
+ atomic_read(&xprt->xpt_ref.refcount));
/*
* Destroy queued, but not processed read completions. Note
@@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work)
}
/* Warn if we leaked a resource or under-referenced */
- if (atomic_read(&rdma->sc_ctxt_used) != 0)
+ if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
- atomic_read(&rdma->sc_ctxt_used));
+ rdma->sc_ctxt_used);
if (atomic_read(&rdma->sc_dma_used) != 0)
pr_err("svcrdma: dma still in use? (%d)\n",
atomic_read(&rdma->sc_dma_used));
- /* De-allocate fastreg mr */
+ /* Final put of backchannel client transport */
+ if (xprt->xpt_bc_xprt) {
+ xprt_put(xprt->xpt_bc_xprt);
+ xprt->xpt_bc_xprt = NULL;
+ }
+
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_ctxts(rdma);
+ svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
ib_destroy_cq(rdma->sc_rq_cq);
- if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
- ib_dereg_mr(rdma->sc_phys_mr);
-
if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
ib_dealloc_pd(rdma->sc_pd);
@@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
int length;
int ret;
- p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ return;
va = page_address(p);
/* XDR encode error */
@@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
return;
}
atomic_inc(&xprt->sc_dma_used);
- ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[0].length = length;
/* Prepare SEND WR */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 740bddc..b1b009f 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -63,7 +63,7 @@
*/
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
-static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
@@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = {
#endif
-#define RPCRDMA_BIND_TO (60U * HZ)
-#define RPCRDMA_INIT_REEST_TO (5U * HZ)
-#define RPCRDMA_MAX_REEST_TO (30U * HZ)
-#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
-
-static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
+static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
}
-static void
+void
xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
char buf[128];
@@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
}
-static void
+void
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
{
unsigned int i;
@@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
if (req == NULL)
return NULL;
- flags = GFP_NOIO | __GFP_NOWARN;
+ flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -642,7 +637,7 @@ drop_connection:
return -ENOTCONN; /* implies disconnect */
}
-static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
long idle_time = 0;
@@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void)
rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
+
+ rc = xprt_unregister_transport(&xprt_rdma_bc);
+ if (rc)
+ dprintk("RPC: %s: xprt_unregister(bc) returned %i\n",
+ __func__, rc);
}
int xprt_rdma_init(void)
@@ -766,6 +766,14 @@ int xprt_rdma_init(void)
return rc;
}
+ rc = xprt_register_transport(&xprt_rdma_bc);
+ if (rc) {
+ xprt_unregister_transport(&xprt_rdma);
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
dprintk("Defaults:\n");
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 732c71c..878f1bf 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -462,7 +462,6 @@ int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
- struct ib_device_attr *devattr = &ia->ri_devattr;
int rc;
ia->ri_dma_mr = NULL;
@@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
- rc = ib_query_device(ia->ri_device, devattr);
- if (rc) {
- dprintk("RPC: %s: ib_query_device failed %d\n",
- __func__, rc);
- goto out3;
- }
-
if (memreg == RPCRDMA_FRMR) {
- if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (devattr->max_fast_reg_page_list_len == 0)) {
+ if (!(ia->ri_device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
@@ -566,24 +559,23 @@ int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
struct ib_cq *sendcq, *recvcq;
struct ib_cq_init_attr cq_attr = {};
unsigned int max_qp_wr;
int rc, err;
- if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
dprintk("RPC: %s: insufficient sge's available\n",
__func__);
return -ENOMEM;
}
- if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+ if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
dprintk("RPC: %s: insufficient wqe's available\n",
__func__);
return -ENOMEM;
}
- max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
ep->rep_remote_cma.responder_resources = 32;
else
ep->rep_remote_cma.responder_resources =
- devattr->max_qp_rd_atom;
+ ia->ri_device->attrs.max_qp_rd_atom;
ep->rep_remote_cma.retry_count = 7;
ep->rep_remote_cma.flow_control = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 728101d..38fe11b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -55,6 +55,11 @@
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
+#define RPCRDMA_BIND_TO (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
+
/*
* Interface Adapter -- one per transport instance
*/
@@ -68,7 +73,6 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
- struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -309,6 +315,8 @@ struct rpcrdma_buffer {
u32 rb_bc_srv_max_requests;
spinlock_t rb_reqslock; /* protect rb_allreqs */
struct list_head rb_allreqs;
+
+ u32 rb_bc_max_requests;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
+extern unsigned int xprt_rdma_max_inline_read;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
@@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/* Temporary NFS request map cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_map_cachep;
-/* WR context cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_ctxt_cachep;
-/* Workqueue created in svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
+extern struct xprt_class xprt_rdma_bc;
#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index ea69ce3..c3bd294 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3746,7 +3746,7 @@ static const struct flag flags[] = {
{ "NET_TX_SOFTIRQ", 2 },
{ "NET_RX_SOFTIRQ", 3 },
{ "BLOCK_SOFTIRQ", 4 },
- { "BLOCK_IOPOLL_SOFTIRQ", 5 },
+ { "IRQ_POLL_SOFTIRQ", 5 },
{ "TASKLET_SOFTIRQ", 6 },
{ "SCHED_SOFTIRQ", 7 },
{ "HRTIMER_SOFTIRQ", 8 },
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 8ff7d62..33b52ea 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -209,7 +209,7 @@ static const struct flag flags[] = {
{ "NET_TX_SOFTIRQ", 2 },
{ "NET_RX_SOFTIRQ", 3 },
{ "BLOCK_SOFTIRQ", 4 },
- { "BLOCK_IOPOLL_SOFTIRQ", 5 },
+ { "IRQ_POLL_SOFTIRQ", 5 },
{ "TASKLET_SOFTIRQ", 6 },
{ "SCHED_SOFTIRQ", 7 },
{ "HRTIMER_SOFTIRQ", 8 },
OpenPOWER on IntegriCloud